summaryrefslogtreecommitdiffstats
path: root/TOOLS/lib/Parse/Matroska/Reader.pm
diff options
context:
space:
mode:
Diffstat (limited to 'TOOLS/lib/Parse/Matroska/Reader.pm')
-rw-r--r--TOOLS/lib/Parse/Matroska/Reader.pm426
1 files changed, 0 insertions, 426 deletions
diff --git a/TOOLS/lib/Parse/Matroska/Reader.pm b/TOOLS/lib/Parse/Matroska/Reader.pm
deleted file mode 100644
index 614b7b12c0..0000000000
--- a/TOOLS/lib/Parse/Matroska/Reader.pm
+++ /dev/null
@@ -1,426 +0,0 @@
-use 5.008;
-use strict;
-use warnings;
-
-# ABSTRACT: a low-level reader for EBML files
-package Parse::Matroska::Reader;
-
-use Parse::Matroska::Definitions qw{elem_by_hexid};
-use Parse::Matroska::Element;
-
-use Carp;
-use Scalar::Util qw{openhandle weaken};
-use IO::Handle;
-use IO::File;
-use List::Util qw{first};
-use Encode;
-
-use constant BIGINT_TRY => 'Pari,GMP,FastCalc';
-use Math::BigInt try => BIGINT_TRY;
-use Math::BigRat try => BIGINT_TRY;
-
-=head1 SYNOPSIS
-
- use Parse::Matroska::Reader;
- my $reader = Parse::Matroska::Reader->new($path);
- $reader->close;
- $reader->open(\$string_with_matroska_data);
-
- my $elem = $reader->read_element;
- print "Element ID: $elem->{elid}\n";
- print "Element name: $elem->{name}\n";
- if ($elem->{type} ne 'sub') {
- print "Element value: $elem->get_value\n";
- } else {
- while (my $child = $elem->next_child) {
- print "Child element: $child->{name}\n";
- }
- }
- $reader->close;
-
-=head1 DESCRIPTION
-
-Reads EBML data, which is used in Matroska files.
-This is a low-level reader which is meant to be used as a backend
-for higher level readers. TODO: write the high level readers :)
-
-=head1 NOTE
-
-The API of this module is not yet considered stable.
-
-=method new
-
-Creates a new reader.
-Calls L</open($arg)> with its arguments if provided.
-
-=cut
-sub new {
- my $class = shift;
- my $self = {};
- bless $self, $class;
-
- $self->open(@_) if @_;
- return $self;
-}
-
-=method open($arg)
-
-Creates the internal filehandle. The argument can be:
-
-=for :list
-* An open filehandle or L<IO::Handle> object.
-The filehandle is not C<dup()>ed, so calling L</close> in this
-object will close the given filehandle as well.
-* A scalar containing a path to a file.
-* On perl v5.14 or newer, a scalarref pointing to EBML data.
-For similar functionality in older perls, give an L<IO::String> object
-or the handle to an already C<open>ed scalarref.
-
-=cut
-sub open {
- my ($self, $arg) = @_;
- $self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw")
- or croak "Can't open $arg: $!";
-}
-
-=method close
-
-Closes the internal filehandle.
-
-=cut
-sub close {
- my ($self) = @_;
- $self->{fh}->close;
- delete $self->{fh};
-}
-
-# equivalent to $self->readlen(1), possibly faster
-sub _getc {
- my ($self) = @_;
- my $c = $self->{fh}->getc;
- croak "Can't do read of length 1: $!" if !defined $c && $!;
- return $c;
-}
-
-=method readlen($length)
-
-Reads C<$length> bytes from the internal filehandle.
-
-=cut
-sub readlen {
- my ($self, $len) = @_;
- my $data;
- my $readlen = $self->{fh}->read($data, $len);
- croak "Can't do read of length $len: $!"
- unless defined $readlen;
- return $data;
-}
-
-# converts a byte string into an integer
-# we do so by converting the integer into a hex string (big-endian)
-# and then reading the hex-string into an integer
-sub _bin2int($) {
- my ($bin) = @_;
- # if the length is larger than 3
- # the resulting integer might be larger than INT_MAX
- if (length($bin) > 3) {
- return Math::BigInt->from_hex(unpack("H*", $bin));
- }
- return hex(unpack("H*", $bin));
-}
-
-# creates a floating-point number with the given mantissa and exponent
-sub _ldexp {
- my ($mantissa, $exponent) = @_;
- my $r = new Math::BigRat($mantissa);
- return $r * Math::BigRat->new(2)**$exponent;
-}
-
-# NOTE: the read_* functions are hard to read because they're ports
-# of even harder to read python functions.
-# TODO: make them readable
-
-=method read_id
-
-Reads an EBML ID atom in hexadecimal string format, suitable
-for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>.
-
-=cut
-sub read_id {
- my ($self) = @_;
- my $t = $self->_getc;
- return undef unless defined $t;
- my $i = 0;
- my $mask = 1<<7;
-
- if (ord($t) == 0) {
- croak "Matroska Syntax error: first byte of ID was \\0"
- }
- until (ord($t) & $mask) {
- ++$i;
- $mask >>= 1;
- }
- # return hex string of the bytes we just read
- return unpack "H*", ($t . $self->readlen($i));
-}
-
-=method read_size
-
-Reads an EBML Data Size atom, which immediately follows
-an EBML ID atom.
-
-This returns an array consisting of:
-
-=for :list
-0. The length of the Data Size atom.
-1. The value encoded in the Data Size atom, which is the length of all the data following it.
-
-=cut
-sub read_size {
- my ($self) = @_;
- my $t = $self->_getc;
- my $i = 0;
- my $mask = 1<<7;
-
- if (ord($t) == 0) {
- croak "Matroska Syntax error: first byte of data size was \\0"
- }
- until (ord($t) & $mask) {
- ++$i;
- $mask >>= 1;
- }
- $t = $t & chr($mask-1); # strip length bits (keep only significant bits)
- return ($i+1, _bin2int $t . $self->readlen($i));
-}
-
-=method read_str($length)
-
-Reads a string of length C<$length> bytes from the internal filehandle.
-The string is already L<Encode/decode>d from C<UTF-8>, which is the
-standard Matroska string encoding.
-
-=cut
-{
- my $utf8 = find_encoding("UTF-8");
- sub read_str {
- my ($self, $length) = @_;
- return $utf8->decode($self->readlen($length));
- }
-}
-
-=method read_uint($length)
-
-Reads an unsigned integer of length C<$length> bytes
-from the internal filehandle.
-
-Returns a L<Math::BigInt> object if C<$length> is greater
-than 4.
-
-=cut
-sub read_uint {
- my ($self, $length) = @_;
- return _bin2int $self->readlen($length);
-}
-
-=method read_sint($length)
-
-Reads a signed integer of length C<$length> bytes
-from the internal filehandle.
-
-Returns a L<Math::BigInt> object if C<$length> is greater
-than 4.
-
-=cut
-sub read_sint {
- my ($self, $length) = @_;
- my $i = $self->read_uint($length);
-
- # Apply 2's complement to the unsigned int
- my $mask = int(2 ** ($length * 8 - 1));
- # if the most significant bit is set...
- if ($i & $mask) {
- # subtract the MSB twice
- $i -= 2 * $mask;
- }
- return $i;
-}
-
-=method read_float($length)
-
-Reads an IEEE floating point number of length C<$length>
-bytes from the internal filehandle.
-
-Only lengths C<4> and C<8> are supported (C C<float> and C<double>).
-
-=cut
-{
- my $b1 = new Math::BigInt 1;
-
- sub read_float {
- my ($self, $length) = @_;
- my $i = new Math::BigInt $self->read_uint($length)->bstr;
- my $f;
-
- # These evil expressions reinterpret an unsigned int as IEEE binary floats
- if ($length == 4) {
- $f = _ldexp(($i & ((1<<23) - 1)) + (1<<23), ($i>>23 & ((1<<8) - 1)) - 150);
- $f = -$f if $i & ($b1<<31);
- } elsif ($length == 8) {
- $f = _ldexp(($i & (($b1<<52) - 1)) + ($b1<<52), ($i>>52 & ((1<<12) - 1)) - 1075);
- $f = -$f if $i & ($b1<<63);
- } else {
- croak "Matroska Syntax error: unsupported IEEE float byte size $length";
- }
-
- return $f;
- }
-}
-
-=method read_ebml_id($length)
-
-Reads an EBML ID when it's encoded as the data inside another
-EBML element, that is, when the enclosing element's C<type> is
-C<ebml_id>.
-
-This returns a hashref with the EBML element description as
-defined in L<Parse::Matroska::Definitions>.
-
-=cut
-sub read_ebml_id {
- my ($self, $length) = @_;
- return elem_by_hexid(unpack("H*", $self->readlen($length)));
-}
-
-=method skip($length)
-
-Skips C<$length> bytes in the internal filehandle.
-
-=cut
-sub skip {
- my ($self, $len) = @_;
- return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1);
- $self->readlen($len);
- return;
-}
-
-=method getpos
-
-Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle.
-
-Returns undef if the internal filehandle can't C<getpos>.
-
-=cut
-sub getpos {
- my ($self) = @_;
- return undef unless $self->{fh}->can('getpos');
- return $self->{fh}->getpos;
-}
-
-=method setpos($pos)
-
-Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle.
-
-Returns C<undef> if the internal filehandle can't C<setpos>.
-
-Croaks if C<setpos> does not seek to the requested position,
-that is, if calling C<getpos> does not yield the same object
-as the C<$pos> argument.
-
-=cut
-sub setpos {
- my ($self, $pos) = @_;
- return undef unless $pos && $self->{fh}->can('setpos');
-
- my $ret = $self->{fh}->setpos($pos);
- croak "Cannot seek to correct position"
- unless $self->getpos eq $pos;
- return $ret;
-}
-
-=method read_element($read_bin)
-
-Reads a full EBML element from the internal filehandle.
-
-Returns a L<Parse::Matroska::Element> object initialized with
-the read data. If C<read_bin> is not present or is false, will
-delay-load the contents of C<binary> type elements, that is,
-they will only be loaded when calling C<get_value> on the
-returned L<Parse::Matroska::Element> object.
-
-Does not read the children of the element if its type is
-C<sub>. Look into the L<Parse::Matroska::Element> interface
-for details in how to read children elements.
-
-Pass a true C<$read_bin> if the stream being read is not
-seekable (C<getpos> is undef) and the contents of C<binary>
-elements is desired, otherwise seeking errors or internal
-filehandle corruption might occur.
-
-=cut
-sub read_element {
- my ($self, $read_bin) = @_;
- return undef if $self->{fh}->eof;
-
- my $elem_pos = $self->getpos;
-
- my $elid = $self->read_id;
- my $elem_def = elem_by_hexid($elid);
- my ($size_len, $content_len) = $self->read_size;
- my $full_len = length($elid)/2 + $size_len + $content_len;
-
- my $elem = Parse::Matroska::Element->new(
- elid => $elid,
- name => $elem_def && $elem_def->{name},
- type => $elem_def && $elem_def->{valtype},
- size_len => $size_len,
- content_len => $content_len,
- full_len => $full_len,
- reader => $self,
- elem_pos => $elem_pos,
- data_pos => $self->getpos,
- );
- weaken($elem->{reader});
-
- if (defined $elem_def) {
- if ($elem->{type} eq 'sub') {
- $elem->{value} = [];
- } elsif ($elem->{type} eq 'str') {
- $elem->{value} = $self->read_str($content_len);
- } elsif ($elem->{type} eq 'ebml_id') {
- $elem->{value} = $self->read_ebml_id($content_len);
- } elsif ($elem->{type} eq 'uint') {
- $elem->{value} = $self->read_uint($content_len);
- } elsif ($elem->{type} eq 'sint') {
- $elem->{value} = $self->read_sint($content_len);
- } elsif ($elem->{type} eq 'float') {
- $elem->{value} = $self->read_float($content_len);
- } elsif ($elem->{type} eq 'skip') {
- $self->skip($content_len);
- } elsif ($elem->{type} eq 'binary') {
- if ($read_bin) {
- $elem->{value} = $self->readlen($content_len);
- } else {
- $self->skip($content_len);
- }
- } else {
- die "Matroska Definition error: type $elem->{valtype} unknown"
- }
- } else {
- $self->skip($content_len);
- }
- return $elem;
-}
-
-1;
-
-=head1 CAVEATS
-
-Children elements have to be processed as soon as an element
-with children is found, or their children ignored with
-L<Parse::Matroska::Element/skip>. Not doing so doesn't cause
-errors but results in an invalid structure, with constant C<0>
-depth.
-
-To work correctly in unseekable streams, either the contents
-of C<binary>-type elements has to be ignored or the C<read_bin>
-flag to C<read_element> has to be true.