From 0112143fdaae0a6264d9e02355e9dc0ca4f7741c Mon Sep 17 00:00:00 2001
From: wm4 <wm4@nowhere>
Date: Tue, 17 Dec 2013 02:39:45 +0100
Subject: Split mpvcore/ into common/, misc/, bstr/

---
 misc/charset_conv.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 misc/charset_conv.h |  19 ++++
 misc/ring.c         | 138 +++++++++++++++++++++++++
 misc/ring.h         | 108 ++++++++++++++++++++
 4 files changed, 552 insertions(+)
 create mode 100644 misc/charset_conv.c
 create mode 100644 misc/charset_conv.h
 create mode 100644 misc/ring.c
 create mode 100644 misc/ring.h

(limited to 'misc')

diff --git a/misc/charset_conv.c b/misc/charset_conv.c
new file mode 100644
index 0000000000..fe396e8ef5
--- /dev/null
+++ b/misc/charset_conv.c
@@ -0,0 +1,287 @@
+/*
+ * This file is part of mpv.
+ *
+ * Based on code taken from libass (ISC license), which was originally part
+ * of MPlayer (GPL).
+ * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+
+#include "common/msg.h"
+
+#if HAVE_ENCA
+#include <enca.h>
+#endif
+
+#if HAVE_LIBGUESS
+#include <libguess.h>
+#endif
+
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
+#include "charset_conv.h"
+
+bool mp_charset_is_utf8(const char *user_cp)
+{
+    return user_cp && (strcasecmp(user_cp, "utf8") == 0 ||  // NULL -> false
+                       strcasecmp(user_cp, "utf-8") == 0);  // case-insensitive
+}
+
+// Split user_cp on ':' into at most max components; the last entry filled
+// keeps the unsplit remainder. Entries point into user_cp (nothing is copied).
+// out_arr must hold max entries. Returns the number filled (0 for NULL input).
+static int split_colon(const char *user_cp, int max, bstr *out_arr)
+{
+    if (!user_cp || max < 1)
+        return 0;
+
+    int count = 0;
+    while (1) {
+        const char *next = strchr(user_cp, ':');
+        if (next && max - count > 1) { // keep one slot for the remainder
+            out_arr[count++] = (bstr){(char *)user_cp, next - user_cp};
+            user_cp = next + 1;
+        } else {
+            out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)};
+            break;
+        }
+    }
+    return count;
+}
+
+// Returns true if user_cp is not a plain iconv codepage but a magic value
+// that requires calling mp_charset_guess() on the input data to determine the
+// real codepage: "enca[:...]", "guess[:...]", or "utf8:..."/"utf-8:..." (see
+// the note on plain "utf8" below).
+bool mp_charset_requires_guess(const char *user_cp)
+{
+    bstr res[2] = {{0}}; // stays zeroed when user_cp is NULL
+    int r = split_colon(user_cp, 2, res);
+    // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
+    // by default, plus a codepage that is used if the input is not UTF-8.
+    return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "guess") == 0 ||
+           (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
+           (r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
+}
+
+#if HAVE_ENCA
+// Detect buf's charset with ENCA. Returns an iconv-style name, or NULL.
+static const char *enca_guess(bstr buf, const char *language)
+{
+    if (!language || !language[0])
+        language = "__"; // neutral language
+
+    const char *detected_cp = NULL;
+    EncaAnalyser analyser = enca_analyser_alloc(language);
+    if (analyser) {
+        enca_set_termination_strictness(analyser, 0);
+        EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
+        const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
+        if (tmp && enc.charset != ENCA_CS_UNKNOWN)
+            detected_cp = tmp;
+        enca_analyser_free(analyser);
+    } else {
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n",
+               language);
+        size_t langcnt;
+        const char **languages = enca_get_languages(&langcnt);
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:");
+        for (size_t i = 0; i < langcnt; i++) // size_t: same type as langcnt
+            mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]);
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n");
+        free(languages);
+    }
+
+    return detected_cp;
+}
+#endif
+
+#if HAVE_LIBGUESS
+// Detect buf's charset with libguess; NULL if no usable language was given.
+static const char *libguess_guess(bstr buf, const char *language)
+{
+    if (!language || !language[0] || strcmp(language, "help") == 0) {
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: "
+               "japanese taiwanese chinese korean russian arabic turkish "
+               "greek hebrew polish baltic\n");
+        return NULL;
+    }
+    return libguess_determine_encoding(buf.start, buf.len, language);
+}
+#endif
+
+// Runs charset auto-detection on the input buffer, and returns the result.
+// If detection fails, the fallback codepage given in user_cp is returned; if
+// there is none, "UTF-8-BROKEN" is returned (NULL if MP_STRICT_UTF8 is set).
+// If user_cp doesn't refer to any known auto-detection (for example because
+// it's a real iconv codepage), user_cp is returned without looking at buf.
+const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
+{
+    if (!mp_charset_requires_guess(user_cp))
+        return user_cp;
+
+    // Do our own UTF-8 detection, because at least ENCA seems to get it
+    // wrong sometimes (suggested by divVerent).
+    int r = bstr_validate_utf8(buf);
+    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
+        return "UTF-8";
+
+    bstr params[3] = {{0}};
+    split_colon(user_cp, 3, params);
+
+    bstr type = params[0];
+    char lang[100];
+    snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1]));
+    const char *fallback = params[2].start; // last item, already 0-terminated
+
+    const char *res = NULL;
+
+#if HAVE_ENCA
+    if (bstrcasecmp0(type, "enca") == 0)
+        res = enca_guess(buf, lang);
+#endif
+#if HAVE_LIBGUESS
+    if (bstrcasecmp0(type, "guess") == 0)
+        res = libguess_guess(buf, lang);
+#endif
+    if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
+        if (!fallback)
+            fallback = params[1].start; // must be already 0-terminated
+    }
+
+    if (res) {
+        mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
+               BSTR_P(type), res);
+    } else {
+        res = fallback;
+        mp_msg(MSGT_SUBREADER, MSGL_DBG2,
+               "Detection with %.*s failed: fallback to %s\n",
+               BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1");
+    }
+
+    if (!res && !(flags & MP_STRICT_UTF8))
+        res = "UTF-8-BROKEN";
+
+    return res;
+}
+
+// Convert the data in buf to UTF-8. The user_cp argument can be an iconv
+// codepage, a value returned by mp_charset_guess(), or a special value
+// that triggers autodetection of the charset (e.g. using ENCA).
+// The auto-detection is the only difference to mp_iconv_to_utf8().
+//  buf: same as mp_iconv_to_utf8()
+//  user_cp: iconv codepage, special value, NULL
+//  flags: same as mp_iconv_to_utf8()
+//  returns: same as mp_iconv_to_utf8()
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
+{
+    return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp, flags), flags);
+}
+
+// Use iconv to convert buf to UTF-8.
+// Returns buf.start==NULL on error. Returns buf unchanged if buf is empty, if
+// cp is NULL or empty, or if no conversion is needed (cp is UTF-8 or ASCII).
+// Returns a newly allocated buffer if conversion is done and succeeds. The
+// buffer will be terminated with 0 for convenience (the terminating 0 is not
+// included in the returned length).
+// Free the returned buffer with talloc_free().
+//  buf: input data
+//  cp: iconv codepage (or NULL)
+//  flags: combination of MP_ICONV_* flags
+//  returns: buf (no conversion), .start==NULL (error), or allocated buffer
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
+{
+#if HAVE_ICONV
+    // Empty input must not reach the conversion code: with buf.len == 0 the
+    // "size - 1" below wraps around and the final 0 is written outside outbuf.
+    if (!buf.len || !cp || !cp[0] || mp_charset_is_utf8(cp))
+        return buf;
+    if (strcasecmp(cp, "ASCII") == 0)
+        return buf;
+    if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
+        return bstr_sanitize_utf8_latin1(NULL, buf);
+
+    iconv_t icdsc;
+    if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) {
+        if (flags & MP_ICONV_VERBOSE)
+            mp_msg(MSGT_SUBREADER, MSGL_ERR,
+                   "Error opening iconv with codepage '%s'\n", cp);
+        goto failure;
+    }
+
+    size_t size = buf.len;
+    size_t osize = size;
+    size_t ileft = size;
+    size_t oleft = size - 1; // one byte stays reserved for the terminating 0
+
+    char *outbuf = talloc_size(NULL, osize);
+    char *ip = buf.start;
+    char *op = outbuf;
+
+    while (1) {
+        int clear = 0;
+        size_t rc;
+        if (ileft)
+            rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
+        else {
+            clear = 1; // clear the conversion state and leave
+            rc = iconv(icdsc, NULL, NULL, &op, &oleft);
+        }
+        if (rc == (size_t) (-1)) {
+            if (errno == E2BIG) { // output full: grow by one input size
+                size_t offset = op - outbuf;
+                outbuf = talloc_realloc_size(NULL, outbuf, osize + size);
+                op = outbuf + offset;
+                osize += size;
+                oleft += size;
+            } else {
+                if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
+                    // This is intended for cases where the input buffer is cut
+                    // at a random byte position. If this happens in the middle
+                    // of the buffer, it should still be an error. We say it's
+                    // fine if the error is within 10 bytes of the end.
+                    if (ileft <= 10)
+                        break;
+                }
+                if (flags & MP_ICONV_VERBOSE) {
+                    mp_msg(MSGT_SUBREADER, MSGL_ERR,
+                           "Error recoding text with codepage '%s'\n", cp);
+                }
+                talloc_free(outbuf);
+                iconv_close(icdsc);
+                goto failure;
+            }
+        } else if (clear)
+            break;
+    }
+
+    iconv_close(icdsc);
+
+    outbuf[osize - oleft - 1] = 0;
+    return (bstr){outbuf, osize - oleft - 1};
+#endif
+
+failure:
+    return (bstr){0};
+}
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
new file mode 100644
index 0000000000..c216ede2be
--- /dev/null
+++ b/misc/charset_conv.h
@@ -0,0 +1,19 @@
+#ifndef MP_CHARSET_CONV_H
+#define MP_CHARSET_CONV_H
+
+#include <stdbool.h>
+#include "bstr/bstr.h"
+
+enum { // flag bits, combined in the "flags" parameter of the functions below
+    MP_ICONV_VERBOSE = 1,       // print errors instead of failing silently
+    MP_ICONV_ALLOW_CUTOFF = 2,  // allow input that is cut off near its end
+    MP_STRICT_UTF8 = 4,         // don't fall back to UTF-8-BROKEN when guessing
+};
+
+bool mp_charset_is_utf8(const char *user_cp);
+bool mp_charset_requires_guess(const char *user_cp);
+const char *mp_charset_guess(bstr buf, const char *user_cp, int flags);
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
+
+#endif
diff --git a/misc/ring.c b/misc/ring.c
new file mode 100644
index 0000000000..eb139c2cab
--- /dev/null
+++ b/misc/ring.c
@@ -0,0 +1,138 @@
+/*
+ * This file is part of mpv.
+ * Copyright (c) 2012 wm4
+ * Copyright (c) 2013 Stefano Pigozzi <stefano.pigozzi@gmail.com>
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <inttypes.h>
+#include <libavutil/common.h>
+#include <assert.h>
+#include "talloc.h"
+#include "compat/atomics.h"
+#include "ring.h"
+
+struct mp_ring {
+    uint8_t  *buffer; // storage; its talloc size is the ring's capacity
+
+    /* Running totals of bytes read and written; the offset into `buffer` is
+     * the total modulo the ring size. Do not read these fields directly, use
+     * the private accessors `mp_ring_get_wpos` and `mp_ring_get_rpos`. */
+    uint32_t rpos, wpos;
+};
+
+static uint32_t mp_ring_get_wpos(struct mp_ring *buffer)
+{
+    mp_memory_barrier(); // wpos is advanced by mp_ring_write()
+    return buffer->wpos;
+}
+
+static uint32_t mp_ring_get_rpos(struct mp_ring *buffer)
+{
+    mp_memory_barrier(); // rpos is advanced by mp_ring_read()/mp_ring_drain()
+    return buffer->rpos;
+}
+
+// Allocate an empty ring of `size` bytes as a talloc child of talloc_ctx.
+struct mp_ring *mp_ring_new(void *talloc_ctx, int size)
+{
+    struct mp_ring *ringbuffer =
+        talloc_zero(talloc_ctx, struct mp_ring);
+
+    // Parent the storage to the ring itself, so that freeing (or stealing)
+    // the ring also releases (or moves) its buffer.
+    ringbuffer->buffer = talloc_size(ringbuffer, size);
+    return ringbuffer;
+}
+
+int mp_ring_drain(struct mp_ring *buffer, int len)
+{
+    int buffered  = mp_ring_buffered(buffer);
+    int drain_len = FFMIN(len, buffered); // cannot drain more than is buffered
+    mp_atomic_add_and_fetch(&buffer->rpos, drain_len); // skip = advance rpos
+    mp_memory_barrier();
+    return drain_len;
+}
+
+int mp_ring_read(struct mp_ring *buffer, unsigned char *dest, int len)
+{
+    if (!dest) return mp_ring_drain(buffer, len);
+
+    int size     = mp_ring_size(buffer);
+    int buffered = mp_ring_buffered(buffer);
+    int read_len = FFMIN(len, buffered);
+    int read_ptr = mp_ring_get_rpos(buffer) % size;
+    // len1: bytes up to the end of the storage; len2: wrapped-around rest
+    int len1 = FFMIN(size - read_ptr, read_len);
+    int len2 = read_len - len1;
+
+    memcpy(dest, buffer->buffer + read_ptr, len1);
+    memcpy(dest + len1, buffer->buffer, len2);
+    // Advance rpos only after both copies have been issued.
+    mp_atomic_add_and_fetch(&buffer->rpos, read_len);
+    mp_memory_barrier();
+
+    return read_len;
+}
+
+int mp_ring_write(struct mp_ring *buffer, unsigned char *src, int len)
+{
+    int size      = mp_ring_size(buffer);
+    int free      = mp_ring_available(buffer);
+    int write_len = FFMIN(len, free);
+    int write_ptr = mp_ring_get_wpos(buffer) % size;
+    // len1: bytes that fit before the end of the storage; len2: wrapped rest
+    int len1 = FFMIN(size - write_ptr, write_len);
+    int len2 = write_len - len1;
+
+    memcpy(buffer->buffer + write_ptr, src, len1);
+    memcpy(buffer->buffer, src + len1, len2);
+    // Advance wpos only after both copies have been issued.
+    mp_atomic_add_and_fetch(&buffer->wpos, write_len);
+    mp_memory_barrier();
+
+    return write_len;
+}
+
+void mp_ring_reset(struct mp_ring *buffer)
+{
+    buffer->wpos = buffer->rpos = 0; // plain stores, unlike read/write/drain
+    mp_memory_barrier(); // NOTE(review): concurrent use looks unsafe - confirm
+}
+
+int mp_ring_available(struct mp_ring *buffer)
+{
+    return mp_ring_size(buffer) - mp_ring_buffered(buffer); // capacity - unread
+}
+
+int mp_ring_size(struct mp_ring *buffer)
+{
+    return talloc_get_size(buffer->buffer); // capacity = size of the talloc chunk
+}
+
+int mp_ring_buffered(struct mp_ring *buffer)
+{
+    return (mp_ring_get_wpos(buffer) - mp_ring_get_rpos(buffer)); // unread bytes
+}
+
+char *mp_ring_repr(struct mp_ring *buffer, void *talloc_ctx)
+{
+    return talloc_asprintf( // the string is allocated on talloc_ctx
+        talloc_ctx,
+        "Ringbuffer { .size = %dB, .buffered = %dB, .available = %dB }",
+        mp_ring_size(buffer),
+        mp_ring_buffered(buffer),
+        mp_ring_available(buffer));
+}
diff --git a/misc/ring.h b/misc/ring.h
new file mode 100644
index 0000000000..e93baea97e
--- /dev/null
+++ b/misc/ring.h
@@ -0,0 +1,108 @@
+/*
+ * This file is part of mpv.
+ * Copyright (c) 2012 wm4
+ * Copyright (c) 2013 Stefano Pigozzi <stefano.pigozzi@gmail.com>
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MPV_MP_RING_H
+#define MPV_MP_RING_H
+
+/**
+ * A simple non-blocking SPSC (single producer, single consumer) ringbuffer
+ * implementation. Thread safety is accomplished through atomic operations.
+ */
+
+struct mp_ring;
+
+/**
+ * Instantiate a new ringbuffer
+ *
+ * talloc_ctx: talloc context of the newly created object
+ * size:       total size in bytes
+ * return:     the newly created ringbuffer
+ */
+struct mp_ring *mp_ring_new(void *talloc_ctx, int size);
+
+/**
+ * Read data from the ringbuffer
+ *
+ * buffer: target ringbuffer instance
+ * dest:   destination buffer for the read data. If NULL, the data is discarded.
+ * len:    maximum number of bytes to read
+ * return: number of bytes read
+ */
+int mp_ring_read(struct mp_ring *buffer, unsigned char *dest, int len);
+
+/**
+ * Write data to the ringbuffer
+ *
+ * buffer: target ringbuffer instance
+ * src:    source buffer for the write data
+ * len:    maximum number of bytes to write
+ * return: number of bytes written
+ */
+int mp_ring_write(struct mp_ring *buffer, unsigned char *src, int len);
+
+/**
+ * Drain data from the ringbuffer
+ *
+ * buffer: target ringbuffer instance
+ * len:    maximum number of bytes to drain
+ * return: number of bytes drained
+ */
+int mp_ring_drain(struct mp_ring *buffer, int len);
+
+/**
+ * Reset the ringbuffer discarding any content
+ *
+ * buffer: target ringbuffer instance
+ */
+void mp_ring_reset(struct mp_ring *buffer);
+
+/**
+ * Get the available size for writing
+ *
+ * buffer: target ringbuffer instance
+ * return: number of bytes that can be written
+ */
+int mp_ring_available(struct mp_ring *buffer);
+
+/**
+ * Get the total size
+ *
+ * buffer: target ringbuffer instance
+ * return: total ringbuffer size in bytes
+ */
+int mp_ring_size(struct mp_ring *buffer);
+
+/**
+ * Get the available size for reading
+ *
+ * buffer: target ringbuffer instance
+ * return: number of bytes ready for reading
+ */
+int mp_ring_buffered(struct mp_ring *buffer);
+
+/**
+ * Get a string representation of the ringbuffer
+ *
+ * buffer:     target ringbuffer instance
+ * talloc_ctx: talloc context of the newly created string
+ * return:     string representing the ringbuffer
+ */
+char *mp_ring_repr(struct mp_ring *buffer, void *talloc_ctx);
+
+#endif
-- 
cgit v1.2.3