diff options
Diffstat (limited to 'TOOLS/lib/Parse/Matroska/Reader.pm')
-rw-r--r-- | TOOLS/lib/Parse/Matroska/Reader.pm | 423 |
1 files changed, 423 insertions, 0 deletions
diff --git a/TOOLS/lib/Parse/Matroska/Reader.pm b/TOOLS/lib/Parse/Matroska/Reader.pm new file mode 100644 index 0000000000..47e67ce5f7 --- /dev/null +++ b/TOOLS/lib/Parse/Matroska/Reader.pm @@ -0,0 +1,423 @@ +use 5.008; +use strict; +use warnings; + +# ABSTRACT: a low-level reader for EBML files +package Parse::Matroska::Reader; + +use Parse::Matroska::Definitions qw{elem_by_hexid}; +use Parse::Matroska::Element; + +use Carp; +use Scalar::Util qw{openhandle weaken}; +use IO::Handle; +use IO::File; +use List::Util qw{first}; +use Encode; + +use constant BIGINT_TRY => 'Pari,GMP,FastCalc'; +use Math::BigInt try => BIGINT_TRY; +use Math::BigRat try => BIGINT_TRY; + +=head1 SYNOPSIS + + use Parse::Matroska::Reader; + my $reader = Parse::Matroska::Reader->new($path); + $reader->close; + $reader->open(\$string_with_matroska_data); + + my $elem = $reader->read_element; + print "Element ID: $elem->{elid}\n"; + print "Element name: $elem->{name}\n"; + if ($elem->{type} ne 'sub') { + print "Element value: $elem->get_value\n"; + } else { + while (my $child = $elem->next_child) { + print "Child element: $child->{name}\n"; + } + } + $reader->close; + +=head1 DESCRIPTION + +Reads EBML data, which is used in Matroska files. +This is a low-level reader which is meant to be used as a backend +for higher level readers. TODO: write the high level readers :) + +=head1 NOTE + +The API of this module is not yet considered stable. + +=method new + +Creates a new reader. +Calls L</open($arg)> with its arguments if provided. + +=cut +sub new { + my $class = shift; + my $self = {}; + bless $self, $class; + + $self->open(@_) if @_; + return $self; +} + +=method open($arg) + +Creates the internal filehandle. The argument can be: + +=for :list +* An open filehandle or L<IO::Handle> object. +The filehandle is not C<dup()>ed, so calling L</close> in this +object will close the given filehandle as well. +* A scalar containing a path to a file. +* On perl v5.14 or newer, a scalarref pointing to EBML data. +For similar functionality in older perls, give an L<IO::String> object +or the handle to an already C<open>ed scalarref. + +=cut +sub open { + my ($self, $arg) = @_; + $self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw") + or croak "Can't open $arg: $!"; +} + +=method close + +Closes the internal filehandle. + +=cut +sub close { + my ($self) = @_; + $self->{fh}->close; + delete $self->{fh}; +} + +# equivalent to $self->readlen(1), possibly faster +sub _getc { + my ($self) = @_; + my $c = $self->{fh}->getc; + croak "Can't do read of length 1: $!" if !defined $c && $!; + return $c; +} + +=method readlen($length) + +Reads C<$length> bytes from the internal filehandle. + +=cut +sub readlen { + my ($self, $len) = @_; + my $data; + my $readlen = $self->{fh}->read($data, $len); + croak "Can't do read of length $len: $!" + unless defined $readlen; + return $data; +} + +# converts a byte string into an integer +# we do so by converting the integer into a hex string (big-endian) +# and then reading the hex-string into an integer +sub _bin2int($) { + my ($bin) = @_; + # if the length is larger than 3 + # the resulting integer might be larger than INT_MAX + if (length($bin) > 3) { + return Math::BigInt->from_hex(unpack("H*", $bin)); + } + return hex(unpack("H*", $bin)); +} + +# creates a floating-point number with the given mantissa and exponent +sub _ldexp { + my ($mantissa, $exponent) = @_; + return $mantissa * Math::BigRat->new(2)**$exponent; +} + +# NOTE: the read_* functions are hard to read because they're ports +# of even harder to read python functions. +# TODO: make them readable + +=method read_id + +Reads an EBML ID atom in hexadecimal string format, suitable +for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>. + +=cut +sub read_id { + my ($self) = @_; + my $t = $self->_getc; + return undef unless defined $t; + my $i = 0; + my $mask = 1<<7; + + if (ord($t) == 0) { + croak "Matroska Syntax error: first byte of ID was \\0" + } + until (ord($t) & $mask) { + ++$i; + $mask >>= 1; + } + # return hex string of the bytes we just read + return unpack "H*", ($t . $self->readlen($i)); +} + +=method read_size + +Reads an EBML Data Size atom, which immediately follows +an EBML ID atom. + +This returns an array consisting of: + +=for :list +0. The length of the Data Size atom. +1. The value encoded in the Data Size atom, which is the length of all the data following it. + +=cut +sub read_size { + my ($self) = @_; + my $t = $self->_getc; + my $i = 0; + my $mask = 1<<7; + + if (ord($t) == 0) { + croak "Matroska Syntax error: first byte of data size was \\0" + } + until (ord($t) & $mask) { + ++$i; + $mask >>= 1; + } + $t = $t & chr($mask-1); # strip length bits (keep only significant bits) + return ($i+1, _bin2int $t . $self->readlen($i)); +} + +=method read_str($length) + +Reads a string of length C<$length> bytes from the internal filehandle. +The string is already L<Encode/decode>d from C<UTF-8>, which is the +standard Matroska string encoding. + +=cut +{ + my $utf8 = find_encoding("UTF-8"); + sub read_str { + my ($self, $length) = @_; + return $utf8->decode($self->readlen($length)); + } +} + +=method read_uint($length) + +Reads an unsigned integer of length C<$length> bytes +from the internal filehandle. + +Returns a L<Math::BigInt> object if C<$length> is greater +than 4. + +=cut +sub read_uint { + my ($self, $length) = @_; + return _bin2int $self->readlen($length); +} + +=method read_sint($length) + +Reads a signed integer of length C<$length> bytes +from the internal filehandle. + +Returns a L<Math::BigInt> object if C<$length> is greater +than 4. + +=cut +sub read_sint { + my ($self, $length) = @_; + my $i = $self->read_uint($length); + + # Apply 2's complement to the unsigned int + my $mask = int(2 ** ($length * 8 - 1)); + # if the most significant bit is set... + if ($i & $mask) { + # subtract the MSB twice + $i -= 2 * $mask; + } + return $i; +} + +=method read_float($length) + +Reads an IEEE floating point number of length C<$length> +bytes from the internal filehandle. + +Only lengths C<4> and C<8> are supported (C C<float> and C<double>). + +=cut +sub read_float { + my ($self, $length) = @_; + my $i = $self->read_uint($length); + my $f; + + use bigrat try => BIGINT_TRY; + + # These evil expressions reinterpret an unsigned int as IEEE binary floats + if ($length == 4) { + $f = _ldexp(($i & (1<<23 - 1)) + (1<<23), ($i>>23 & (1<<8 - 1)) - 150); + $f = -$f if $i & (1<<31); + } elsif ($length == 8) { + $f = _ldexp(($i & (1<<52 - 1)) + (1<<52), ($i>>52 & (1<<12 - 1)) - 1075); + $f = -$f if $i & (1<<63); + } else { + croak "Matroska Syntax error: unsupported IEEE float byte size $length"; + } + + return $f; +} + +=method read_ebml_id($length) + +Reads an EBML ID when it's encoded as the data inside another +EBML element, that is, when the enclosing element's C<type> is +C<ebml_id>. + +This returns a hashref with the EBML element description as +defined in L<Parse::Matroska::Definitions>. + +=cut +sub read_ebml_id { + my ($self, $length) = @_; + return elem_by_hexid(unpack("H*", $self->readlen($length))); +} + +=method skip($length) + +Skips C<$length> bytes in the internal filehandle. + +=cut +sub skip { + my ($self, $len) = @_; + return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1); + $self->readlen($len); + return; +} + +=method getpos + +Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle. + +Returns undef if the internal filehandle can't C<getpos>. + +=cut +sub getpos { + my ($self) = @_; + return undef unless $self->{fh}->can('getpos'); + return $self->{fh}->getpos; +} + +=method setpos($pos) + +Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle. + +Returns C<undef> if the internal filehandle can't C<setpos>. + +Croaks if C<setpos> does not seek to the requested position, +that is, if calling C<getpos> does not yield the same object +as the C<$pos> argument. + +=cut +sub setpos { + my ($self, $pos) = @_; + return undef unless $pos && $self->{fh}->can('setpos'); + + my $ret = $self->{fh}->setpos($pos); + croak "Cannot seek to correct position" + unless $self->getpos eq $pos; + return $ret; +} + +=method read_element($read_bin) + +Reads a full EBML element from the internal filehandle. + +Returns a L<Parse::Matroska::Element> object initialized with +the read data. If C<read_bin> is not present or is false, will +delay-load the contents of C<binary> type elements, that is, +they will only be loaded when calling C<get_value> on the +returned L<Parse::Matroska::Element> object. + +Does not read the children of the element if its type is +C<sub>. Look into the L<Parse::Matroska::Element> interface +for details in how to read children elements. + +Pass a true C<$read_bin> if the stream being read is not +seekable (C<getpos> is undef) and the contents of C<binary> +elements is desired, otherwise seeking errors or internal +filehandle corruption might occur. + +=cut +sub read_element { + my ($self, $read_bin) = @_; + return undef if $self->{fh}->eof; + + my $elem_pos = $self->getpos; + + my $elid = $self->read_id; + my $elem_def = elem_by_hexid($elid); + my ($size_len, $content_len) = $self->read_size; + my $full_len = length($elid)/2 + $size_len + $content_len; + + my $elem = Parse::Matroska::Element->new( + elid => $elid, + name => $elem_def && $elem_def->{name}, + type => $elem_def && $elem_def->{valtype}, + size_len => $size_len, + content_len => $content_len, + full_len => $full_len, + reader => $self, + elem_pos => $elem_pos, + data_pos => $self->getpos, + ); + weaken($elem->{reader}); + + if (defined $elem_def) { + if ($elem->{type} eq 'sub') { + $elem->{value} = []; + } elsif ($elem->{type} eq 'str') { + $elem->{value} = $self->read_str($content_len); + } elsif ($elem->{type} eq 'ebml_id') { + $elem->{value} = $self->read_ebml_id($content_len); + } elsif ($elem->{type} eq 'uint') { + $elem->{value} = $self->read_uint($content_len); + } elsif ($elem->{type} eq 'sint') { + $elem->{value} = $self->read_sint($content_len); + } elsif ($elem->{type} eq 'float') { + $elem->{value} = $self->read_float($content_len); + } elsif ($elem->{type} eq 'skip') { + $self->skip($content_len); + } elsif ($elem->{type} eq 'binary') { + if ($read_bin) { + $elem->{value} = $self->readlen($content_len); + } else { + $self->skip($content_len); + } + } else { + die "Matroska Definition error: type $elem->{valtype} unknown" + } + } else { + $self->skip($content_len); + } + return $elem; +} + +1; + +=head1 CAVEATS + +Children elements have to be processed as soon as an element +with children is found, or their children ignored with +L<Parse::Matroska::Element/skip>. Not doing so doesn't cause +errors but results in an invalid structure, with constant C<0> +depth. + +To work correctly in unseekable streams, either the contents +of C<binary>-type elements has to be ignored or the C<read_bin> +flag to C<read_element> has to be true. |