From 0112143fdaae0a6264d9e02355e9dc0ca4f7741c Mon Sep 17 00:00:00 2001 From: wm4 Date: Tue, 17 Dec 2013 02:39:45 +0100 Subject: Split mpvcore/ into common/, misc/, bstr/ --- misc/charset_conv.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++++++ misc/charset_conv.h | 19 ++++ misc/ring.c | 138 +++++++++++++++++++++++++ misc/ring.h | 108 ++++++++++++++++++++ 4 files changed, 552 insertions(+) create mode 100644 misc/charset_conv.c create mode 100644 misc/charset_conv.h create mode 100644 misc/ring.c create mode 100644 misc/ring.h (limited to 'misc') diff --git a/misc/charset_conv.c b/misc/charset_conv.c new file mode 100644 index 0000000000..fe396e8ef5 --- /dev/null +++ b/misc/charset_conv.c @@ -0,0 +1,287 @@
/*
 * This file is part of mpv.
 *
 * Based on code taken from libass (ISC license), which was originally part
 * of MPlayer (GPL).
 * Copyright (C) 2006 Evgeniy Stepanov
 *
 * mpv is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpv is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with mpv. If not, see <http://www.gnu.org/licenses/>.
 */

// NOTE(review): the header names of the system includes were lost when this
// patch was extracted. They are restored from the identifiers this file uses
// (errno/E2BIG/EINVAL, strchr/strlen/strcmp, strcasecmp) -- confirm against
// the upstream commit.
#include <errno.h>
#include <string.h>
#include <strings.h>

#include "config.h"

#include "common/msg.h"

#if HAVE_ENCA
#include <enca.h>
#endif

#if HAVE_LIBGUESS
#include <libguess.h>
#endif

#if HAVE_ICONV
#include <iconv.h>
#endif

#include "charset_conv.h"

// Check whether user_cp names the UTF-8 codepage.
//  user_cp: codepage name, may be NULL
//  returns: true if user_cp is "utf8" or "utf-8" (compared ignoring case),
//           false otherwise (including for NULL)
bool mp_charset_is_utf8(const char *user_cp)
{
    if (!user_cp)
        return false;
    return strcasecmp(user_cp, "utf8") == 0 ||
           strcasecmp(user_cp, "utf-8") == 0;
}

// Split the string on ':' into components.
// out_arr is at least max entries long.
+// Return number of out_arr entries filled. +static int split_colon(const char *user_cp, int max, bstr *out_arr) +{ + if (!user_cp || max < 1) + return 0; + + int count = 0; + while (1) { + const char *next = strchr(user_cp, ':'); + if (next && max - count > 1) { + out_arr[count++] = (bstr){(char *)user_cp, next - user_cp}; + user_cp = next + 1; + } else { + out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)}; + break; + } + } + return count; +} + +// Returns true if user_cp implies that calling mp_charset_guess() on the +// input data is required to determine the real codepage. This is the case +// if user_cp is not a real iconv codepage, but a magic value that requests +// for example ENCA charset auto-detection. +bool mp_charset_requires_guess(const char *user_cp) +{ + bstr res[2] = {{0}}; + int r = split_colon(user_cp, 2, res); + // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 + // by default, plus a codepage that is used if the input is not UTF-8. 
+ return bstrcasecmp0(res[0], "enca") == 0 || + bstrcasecmp0(res[0], "guess") == 0 || + (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || + (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); +} + +#if HAVE_ENCA +static const char *enca_guess(bstr buf, const char *language) +{ + if (!language || !language[0]) + language = "__"; // neutral language + + const char *detected_cp = NULL; + + EncaAnalyser analyser = enca_analyser_alloc(language); + if (analyser) { + enca_set_termination_strictness(analyser, 0); + EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len); + const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV); + if (tmp && enc.charset != ENCA_CS_UNKNOWN) + detected_cp = tmp; + enca_analyser_free(analyser); + } else { + mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n", + language); + size_t langcnt; + const char **languages = enca_get_languages(&langcnt); + mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:"); + for (int i = 0; i < langcnt; i++) + mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]); + mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n"); + free(languages); + } + + return detected_cp; +} +#endif + +#if HAVE_LIBGUESS +static const char *libguess_guess(bstr buf, const char *language) +{ + if (!language || !language[0] || strcmp(language, "help") == 0) { + mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: " + "japanese taiwanese chinese korean russian arabic turkish " + "greek hebrew polish baltic\n"); + return NULL; + } + + return libguess_determine_encoding(buf.start, buf.len, language); +} +#endif + +// Runs charset auto-detection on the input buffer, and returns the result. +// If auto-detection fails, NULL is returned. +// If user_cp doesn't refer to any known auto-detection (for example because +// it's a real iconv codepage), user_cp is returned without even looking at +// the buf data. 
+const char *mp_charset_guess(bstr buf, const char *user_cp, int flags) +{ + if (!mp_charset_requires_guess(user_cp)) + return user_cp; + + // Do our own UTF-8 detection, because at least ENCA seems to get it + // wrong sometimes (suggested by divVerent). + int r = bstr_validate_utf8(buf); + if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) + return "UTF-8"; + + bstr params[3] = {{0}}; + split_colon(user_cp, 3, params); + + bstr type = params[0]; + char lang[100]; + snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); + const char *fallback = params[2].start; // last item, already 0-terminated + + const char *res = NULL; + +#if HAVE_ENCA + if (bstrcasecmp0(type, "enca") == 0) + res = enca_guess(buf, lang); +#endif +#if HAVE_LIBGUESS + if (bstrcasecmp0(type, "guess") == 0) + res = libguess_guess(buf, lang); +#endif + if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { + if (!fallback) + fallback = params[1].start; // must be already 0-terminated + } + + if (res) { + mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", + BSTR_P(type), res); + } else { + res = fallback; + mp_msg(MSGT_SUBREADER, MSGL_DBG2, + "Detection with %.*s failed: fallback to %s\n", + BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); + } + + if (!res && !(flags & MP_STRICT_UTF8)) + res = "UTF-8-BROKEN"; + + return res; +} + +// Convert the data in buf to UTF-8. The charset argument can be an iconv +// codepage, a value returned by mp_charset_conv_guess(), or a special value +// that triggers autodetection of the charset (e.g. using ENCA). +// The auto-detection is the only difference to mp_iconv_to_utf8(). 
+// buf: same as mp_iconv_to_utf8() +// user_cp: iconv codepage, special value, NULL +// flags: same as mp_iconv_to_utf8() +// returns: same as mp_iconv_to_utf8() +bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags) +{ + return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp, flags), flags); +} + +// Use iconv to convert buf to UTF-8. +// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is +// obviously no conversion required (e.g. if cp is "UTF-8"). +// Returns a newly allocated buffer if conversion is done and succeeds. The +// buffer will be terminated with 0 for convenience (the terminating 0 is not +// included in the returned length). +// Free the returned buffer with talloc_free(). +// buf: input data +// cp: iconv codepage (or NULL) +// flags: combination of MP_ICONV_* flags +// returns: buf (no conversion), .start==NULL (error), or allocated buffer +bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags) +{ +#if HAVE_ICONV + if (!cp || !cp[0] || mp_charset_is_utf8(cp)) + return buf; + + if (strcasecmp(cp, "ASCII") == 0) + return buf; + + if (strcasecmp(cp, "UTF-8-BROKEN") == 0) + return bstr_sanitize_utf8_latin1(NULL, buf); + + iconv_t icdsc; + if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) { + if (flags & MP_ICONV_VERBOSE) + mp_msg(MSGT_SUBREADER, MSGL_ERR, + "Error opening iconv with codepage '%s'\n", cp); + goto failure; + } + + size_t size = buf.len; + size_t osize = size; + size_t ileft = size; + size_t oleft = size - 1; + + char *outbuf = talloc_size(NULL, osize); + char *ip = buf.start; + char *op = outbuf; + + while (1) { + int clear = 0; + size_t rc; + if (ileft) + rc = iconv(icdsc, &ip, &ileft, &op, &oleft); + else { + clear = 1; // clear the conversion state and leave + rc = iconv(icdsc, NULL, NULL, &op, &oleft); + } + if (rc == (size_t) (-1)) { + if (errno == E2BIG) { + size_t offset = op - outbuf; + outbuf = talloc_realloc_size(NULL, outbuf, osize + size); + op = outbuf + 
offset; + osize += size; + oleft += size; + } else { + if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) { + // This is intended for cases where the input buffer is cut + // at a random byte position. If this happens in the middle + // of the buffer, it should still be an error. We say it's + // fine if the error is within 10 bytes of the end. + if (ileft <= 10) + break; + } + if (flags & MP_ICONV_VERBOSE) { + mp_msg(MSGT_SUBREADER, MSGL_ERR, + "Error recoding text with codepage '%s'\n", cp); + } + talloc_free(outbuf); + iconv_close(icdsc); + goto failure; + } + } else if (clear) + break; + } + + iconv_close(icdsc); + + outbuf[osize - oleft - 1] = 0; + return (bstr){outbuf, osize - oleft - 1}; +#endif + +failure: + return (bstr){0}; +} diff --git a/misc/charset_conv.h b/misc/charset_conv.h new file mode 100644 index 0000000000..c216ede2be --- /dev/null +++ b/misc/charset_conv.h @@ -0,0 +1,19 @@ +#ifndef MP_CHARSET_CONV_H +#define MP_CHARSET_CONV_H + +#include +#include "bstr/bstr.h" + +enum { + MP_ICONV_VERBOSE = 1, // print errors instead of failing silently + MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data + MP_STRICT_UTF8 = 4, // don't fall back to UTF-8-BROKEN when guessing +}; + +bool mp_charset_is_utf8(const char *user_cp); +bool mp_charset_requires_guess(const char *user_cp); +const char *mp_charset_guess(bstr buf, const char *user_cp, int flags); +bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags); +bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags); + +#endif diff --git a/misc/ring.c b/misc/ring.c new file mode 100644 index 0000000000..eb139c2cab --- /dev/null +++ b/misc/ring.c @@ -0,0 +1,138 @@ +/* + * This file is part of mpv. 
+ * Copyright (c) 2012 wm4 + * Copyright (c) 2013 Stefano Pigozzi + * + * mpv is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with mpv. If not, see . + */ + +#include +#include +#include +#include "talloc.h" +#include "compat/atomics.h" +#include "ring.h" + +struct mp_ring { + uint8_t *buffer; + + /* Positions of the first readable/writeable chunks. Do not read this + * fields but use the atomic private accessors `mp_ring_get_wpos` + * and `mp_ring_get_rpos`. */ + uint32_t rpos, wpos; +}; + +static uint32_t mp_ring_get_wpos(struct mp_ring *buffer) +{ + mp_memory_barrier(); + return buffer->wpos; +} + +static uint32_t mp_ring_get_rpos(struct mp_ring *buffer) +{ + mp_memory_barrier(); + return buffer->rpos; +} + +struct mp_ring *mp_ring_new(void *talloc_ctx, int size) +{ + struct mp_ring *ringbuffer = + talloc_zero(talloc_ctx, struct mp_ring); + + *ringbuffer = (struct mp_ring) { + .buffer = talloc_size(talloc_ctx, size), + }; + + return ringbuffer; +} + +int mp_ring_drain(struct mp_ring *buffer, int len) +{ + int buffered = mp_ring_buffered(buffer); + int drain_len = FFMIN(len, buffered); + mp_atomic_add_and_fetch(&buffer->rpos, drain_len); + mp_memory_barrier(); + return drain_len; +} + +int mp_ring_read(struct mp_ring *buffer, unsigned char *dest, int len) +{ + if (!dest) return mp_ring_drain(buffer, len); + + int size = mp_ring_size(buffer); + int buffered = mp_ring_buffered(buffer); + int read_len = FFMIN(len, buffered); + int read_ptr = 
mp_ring_get_rpos(buffer) % size; + + int len1 = FFMIN(size - read_ptr, read_len); + int len2 = read_len - len1; + + memcpy(dest, buffer->buffer + read_ptr, len1); + memcpy(dest + len1, buffer->buffer, len2); + + mp_atomic_add_and_fetch(&buffer->rpos, read_len); + mp_memory_barrier(); + + return read_len; +} + +int mp_ring_write(struct mp_ring *buffer, unsigned char *src, int len) +{ + int size = mp_ring_size(buffer); + int free = mp_ring_available(buffer); + int write_len = FFMIN(len, free); + int write_ptr = mp_ring_get_wpos(buffer) % size; + + int len1 = FFMIN(size - write_ptr, write_len); + int len2 = write_len - len1; + + memcpy(buffer->buffer + write_ptr, src, len1); + memcpy(buffer->buffer, src + len1, len2); + + mp_atomic_add_and_fetch(&buffer->wpos, write_len); + mp_memory_barrier(); + + return write_len; +} + +void mp_ring_reset(struct mp_ring *buffer) +{ + buffer->wpos = buffer->rpos = 0; + mp_memory_barrier(); +} + +int mp_ring_available(struct mp_ring *buffer) +{ + return mp_ring_size(buffer) - mp_ring_buffered(buffer); +} + +int mp_ring_size(struct mp_ring *buffer) +{ + return talloc_get_size(buffer->buffer); +} + +int mp_ring_buffered(struct mp_ring *buffer) +{ + return (mp_ring_get_wpos(buffer) - mp_ring_get_rpos(buffer)); +} + +char *mp_ring_repr(struct mp_ring *buffer, void *talloc_ctx) +{ + return talloc_asprintf( + talloc_ctx, + "Ringbuffer { .size = %dB, .buffered = %dB, .available = %dB }", + mp_ring_size(buffer), + mp_ring_buffered(buffer), + mp_ring_available(buffer)); +} diff --git a/misc/ring.h b/misc/ring.h new file mode 100644 index 0000000000..e93baea97e --- /dev/null +++ b/misc/ring.h @@ -0,0 +1,108 @@ +/* + * This file is part of mpv. 
+ * Copyright (c) 2012 wm4 + * Copyright (c) 2013 Stefano Pigozzi + * + * mpv is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with mpv. If not, see . + */ + +#ifndef MPV_MP_RING_H +#define MPV_MP_RING_H + +/** + * A simple non-blocking SPSC (single producer, single consumer) ringbuffer + * implementation. Thread safety is accomplished through atomic operations. + */ + +struct mp_ring; + +/** + * Instantiate a new ringbuffer + * + * talloc_ctx: talloc context of the newly created object + * size: total size in bytes + * return: the newly created ringbuffer + */ +struct mp_ring *mp_ring_new(void *talloc_ctx, int size); + +/** + * Read data from the ringbuffer + * + * buffer: target ringbuffer instance + * dest: destination buffer for the read data. If NULL read data is discarded. 
+ * len: maximum number of bytes to read + * return: number of bytes read + */ +int mp_ring_read(struct mp_ring *buffer, unsigned char *dest, int len); + +/** + * Write data to the ringbuffer + * + * buffer: target ringbuffer instance + * src: source buffer for the write data + * len: maximum number of bytes to write + * return: number of bytes written + */ +int mp_ring_write(struct mp_ring *buffer, unsigned char *src, int len); + +/** + * Drain data from the ringbuffer + * + * buffer: target ringbuffer instance + * len: maximum number of bytes to drain + * return: number of bytes drained + */ +int mp_ring_drain(struct mp_ring *buffer, int len); + +/** + * Reset the ringbuffer discarding any content + * + * buffer: target ringbuffer instance + */ +void mp_ring_reset(struct mp_ring *buffer); + +/** + * Get the available size for writing + * + * buffer: target ringbuffer instance + * return: number of bytes that can be written + */ +int mp_ring_available(struct mp_ring *buffer); + +/** + * Get the total size + * + * buffer: target ringbuffer instance + * return: total ringbuffer size in bytes + */ +int mp_ring_size(struct mp_ring *buffer); + +/** + * Get the available size for reading + * + * buffer: target ringbuffer instance + * return: number of bytes ready for reading + */ +int mp_ring_buffered(struct mp_ring *buffer); + +/** + * Get a string representation of the ringbuffer + * + * buffer: target ringbuffer instance + * talloc_ctx: talloc context of the newly created string + * return: string representing the ringbuffer + */ +char *mp_ring_repr(struct mp_ring *buffer, void *talloc_ctx); + +#endif -- cgit v1.2.3