From b20e08ed94c850b419ed9b9e1736cf32da71afc7 Mon Sep 17 00:00:00 2001 From: reimar Date: Sun, 28 Feb 2010 15:24:30 +0000 Subject: Extend stream_read_line to support reading lines from UTF-16 encoded files and use this to support reading UTF-16 encoded subtitle files in subreader.c git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@30799 b3059339-0415-0410-9bf9-f77b7e298cf2 --- Changelog | 1 + stream/stream.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- stream/stream.h | 2 +- subreader.c | 95 ++++++++++++++++++++++++++------------------------ 4 files changed, 153 insertions(+), 51 deletions(-) diff --git a/Changelog b/Changelog index fa6f29fed3..67344a262b 100644 --- a/Changelog +++ b/Changelog @@ -36,6 +36,7 @@ MPlayer (1.0) * support for displaying subs in the terminal (FIXME) * support for subtitles with audio-only files * support for right-to-left languages with embedded subtitles + * support for UTF-16 encoded external subtitles * support for 8 channel audio * sync dvd:// and dvdnav:// features * support for MPEG-4 ASP in VDPAU video output (non B-frame only) diff --git a/stream/stream.c b/stream/stream.c index 47d4b9e2a5..83cfdb02c8 100644 --- a/stream/stream.c +++ b/stream/stream.c @@ -41,6 +41,7 @@ #include "network.h" #include "stream.h" #include "libmpdemux/demuxer.h" +#include "libavutil/intreadwrite.h" #include "m_option.h" #include "m_struct.h" @@ -488,9 +489,103 @@ int stream_check_interrupt(int time) { return stream_check_interrupt_cb(time); } -unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max) { +/** + * Helper function to read 16 bits little-endian and advance pointer + */ +static uint16_t get_le16_inc(const uint8_t **buf) +{ + uint16_t v = AV_RL16(*buf); + *buf += 2; + return v; +} + +/** + * Helper function to read 16 bits big-endian and advance pointer + */ +static uint16_t get_be16_inc(const uint8_t **buf) +{ + uint16_t v = AV_RB16(*buf); + *buf += 2; + return v; +} + +/** + * Find a newline character in 
buffer + * \param buf buffer to search + * \param len amount of bytes to search in buffer, may not overread + * \param utf16 choose between UTF-8/ASCII/other and LE and BE UTF-16 + * 0 = UTF-8/ASCII/other, 1 = UTF-16-LE, 2 = UTF-16-BE + */ +static const uint8_t *find_newline(const uint8_t *buf, int len, int utf16) +{ + uint32_t c; + const uint8_t *end = buf + len; + switch (utf16) { + case 0: + return (uint8_t *)memchr(buf, '\n', len); + case 1: + while (buf < end - 1) { + GET_UTF16(c, buf < end - 1 ? get_le16_inc(&buf) : 0, return NULL;) + if (buf <= end && c == '\n') + return buf - 1; + } + break; + case 2: + while (buf < end - 1) { + GET_UTF16(c, buf < end - 1 ? get_be16_inc(&buf) : 0, return NULL;) + if (buf <= end && c == '\n') + return buf - 1; + } + break; + } + return NULL; +} + +/** + * Copy a number of bytes, converting to UTF-8 if input is UTF-16 + * \param dst buffer to copy to + * \param dstsize size of dst buffer + * \param src buffer to copy from + * \param len amount of bytes to copy from src + * \param utf16 choose between UTF-8/ASCII/other and LE and BE UTF-16 + * 0 = UTF-8/ASCII/other, 1 = UTF-16-LE, 2 = UTF-16-BE + */ +static int copy_characters(uint8_t *dst, int dstsize, + const uint8_t *src, int *len, int utf16) +{ + uint32_t c; + uint8_t *dst_end = dst + dstsize; + const uint8_t *end = src + *len; + switch (utf16) { + case 0: + if (*len > dstsize) + *len = dstsize; + memcpy(dst, src, *len); + return *len; + case 1: + while (src < end - 1 && dst_end - dst > 8) { + uint8_t tmp; + GET_UTF16(c, src < end - 1 ? get_le16_inc(&src) : 0, ;) + PUT_UTF8(c, tmp, *dst++ = tmp;) + } + *len -= end - src; + return dstsize - (dst_end - dst); + case 2: + while (src < end - 1 && dst_end - dst > 8) { + uint8_t tmp; + GET_UTF16(c, src < end - 1 ?
get_be16_inc(&src) : 0, ;) + PUT_UTF8(c, tmp, *dst++ = tmp;) + } + *len -= end - src; + return dstsize - (dst_end - dst); + } + return 0; +} + +unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max, int utf16) { int len; - unsigned char* end,*ptr = mem; + const unsigned char *end; + unsigned char *ptr = mem; if (max < 1) return NULL; max--; // reserve one for 0-termination do { @@ -499,13 +594,14 @@ unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max) { if(len <= 0 && (!cache_stream_fill_buffer(s) || (len = s->buf_len-s->buf_pos) <= 0)) break; - end = (unsigned char*) memchr((void*)(s->buffer+s->buf_pos),'\n',len); + end = find_newline(s->buffer+s->buf_pos, len, utf16); if(end) len = end - (s->buffer+s->buf_pos) + 1; if(len > 0 && max > 0) { - int l = len > max ? max : len; - memcpy(ptr,s->buffer+s->buf_pos,l); + int l = copy_characters(ptr, max, s->buffer+s->buf_pos, &len, utf16); max -= l; ptr += l; + if (!len) + break; } s->buf_pos += len; } while(!end); diff --git a/stream/stream.h b/stream/stream.h index cc70c81db5..7774651611 100644 --- a/stream/stream.h +++ b/stream/stream.h @@ -265,7 +265,7 @@ inline static int stream_read(stream_t *s,char* mem,int total){ return total; } -unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max); +unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max, int utf16); inline static int stream_eof(stream_t *s){ return s->eof; diff --git a/subreader.c b/subreader.c index 4f8782353d..0169925727 100644 --- a/subreader.c +++ b/subreader.c @@ -111,7 +111,7 @@ static char *stristr(const char *haystack, const char *needle) { return NULL; } -static subtitle *sub_read_line_sami(stream_t* st, subtitle *current) { +static subtitle *sub_read_line_sami(stream_t* st, subtitle *current, int utf16) { static char line[LINE_LEN+1]; static char *s = NULL, *slacktime_s; char text[LINE_LEN+1], *p=NULL, *q; @@ -123,7 +123,7 @@ static subtitle *sub_read_line_sami(stream_t* st, 
subtitle *current) { /* read the first line */ if (!s) - if (!(s = stream_read_line(st, line, LINE_LEN))) return 0; + if (!(s = stream_read_line(st, line, LINE_LEN, utf16))) return 0; do { switch (state) { @@ -230,7 +230,7 @@ static subtitle *sub_read_line_sami(stream_t* st, subtitle *current) { } /* read next line */ - if (state != 99 && !(s = stream_read_line (st, line, LINE_LEN))) { + if (state != 99 && !(s = stream_read_line (st, line, LINE_LEN, utf16))) { if (current->start > 0) { break; // if it is the last subtitle } else { @@ -274,14 +274,14 @@ static char *sub_readtext(char *source, char **dest) { else return NULL; // last text field } -static subtitle *sub_read_line_microdvd(stream_t *st,subtitle *current) { +static subtitle *sub_read_line_microdvd(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; char line2[LINE_LEN+1]; char *p, *next; int i; do { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; } while ((sscanf (line, "{%ld}{}%[^\r\n]", &(current->start), line2) < 2) && @@ -302,14 +302,14 @@ static subtitle *sub_read_line_microdvd(stream_t *st,subtitle *current) { return current; } -static subtitle *sub_read_line_mpl2(stream_t *st,subtitle *current) { +static subtitle *sub_read_line_mpl2(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; char line2[LINE_LEN+1]; char *p, *next; int i; do { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; } while ((sscanf (line, "[%ld][%ld]%[^\r\n]", &(current->start), &(current->end), line2) < 3)); @@ -328,19 +328,19 @@ static subtitle *sub_read_line_mpl2(stream_t *st,subtitle *current) { return current; } -static subtitle *sub_read_line_subrip(stream_t* st, subtitle *current) { +static subtitle *sub_read_line_subrip(stream_t* st, subtitle *current, int utf16) { char line[LINE_LEN+1]; int a1,a2,a3,a4,b1,b2,b3,b4; char *p=NULL, *q=NULL; 
int len; while (1) { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if (sscanf (line, "%d:%d:%d.%d,%d:%d:%d.%d",&a1,&a2,&a3,&a4,&b1,&b2,&b3,&b4) < 8) continue; current->start = a1*360000+a2*6000+a3*100+a4; current->end = b1*360000+b2*6000+b3*100+b4; - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; p=q=line; for (current->lines=1; current->lines < SUB_MAX_TEXT; current->lines++) { @@ -358,21 +358,21 @@ static subtitle *sub_read_line_subrip(stream_t* st, subtitle *current) { return current; } -static subtitle *sub_read_line_subviewer(stream_t *st,subtitle *current) { +static subtitle *sub_read_line_subviewer(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; int a1,a2,a3,a4,b1,b2,b3,b4; char *p=NULL; int i,len; while (!current->text[0]) { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if ((len=sscanf (line, "%d:%d:%d%[,.:]%d --> %d:%d:%d%[,.:]%d",&a1,&a2,&a3,(char *)&i,&a4,&b1,&b2,&b3,(char *)&i,&b4)) < 10) continue; current->start = a1*360000+a2*6000+a3*100+a4/10; current->end = b1*360000+b2*6000+b3*100+b4/10; for (i=0; itext[0]) { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if (line[0]!='{') continue; if ((len=sscanf (line, "{T %d:%d:%d:%d",&a1,&a2,&a3,&a4)) < 4) continue; current->start = a1*360000+a2*6000+a3*100+a4/10; for (i=0; itext[0]) { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if ((len=sscanf (line, "%d:%d:%d%c%n",&a1,&a2,&a3,&separator,&plen)) < 4) continue; @@ -489,7 +489,7 @@ static subtitle *sub_read_line_vplayer(stream_t *st,subtitle *current) { return current; } -static subtitle *sub_read_line_rt(stream_t *st,subtitle *current) { +static subtitle 
*sub_read_line_rt(stream_t *st,subtitle *current, int utf16) { //TODO: This format uses quite rich (sub/super)set of xhtml // I couldn't check it since DTD is not included. // WARNING: full XML parses can be required for proper parsing @@ -499,7 +499,7 @@ static subtitle *sub_read_line_rt(stream_t *st,subtitle *current) { int i,len,plen; while (!current->text[0]) { - if (!stream_read_line (st, line, LINE_LEN)) return NULL; + if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; //TODO: it seems that format of time is not easily determined, it may be 1:12, 1:12.0 or 0:1:12.0 //to describe the same moment in time. Maybe there are even more formats in use. //if ((len=sscanf (line, "