From fae73079310eef9dce9737f2e37ff4b80c8830ee Mon Sep 17 00:00:00 2001
From: Kovensky <diogomfranco@gmail.com>
Date: Wed, 7 Nov 2012 11:49:44 -0300
Subject: Port several python scripts to Perl

file2string.pl and vdpau_functions.pl are direct ports.
matroska.py was reimplemented as the Parse::Matroska module in CPAN,
and matroska.pl was made a client of Parse::Matroska.
A copy of Parse::Matroska is included in TOOLS/lib, and matroska.pl
looks there first when trying to load the module.

osxbundle.py was not ported since I have no means to verify it.
Python is always available on OSX though, so there is no harm in
removing the check for it on configure.
---
 TOOLS/lib/Parse/Matroska.pm             |  30 +++
 TOOLS/lib/Parse/Matroska/Definitions.pm | 350 ++++++++++++++++++++++++++
 TOOLS/lib/Parse/Matroska/Element.pm     | 331 +++++++++++++++++++++++++
 TOOLS/lib/Parse/Matroska/Reader.pm      | 423 ++++++++++++++++++++++++++++++++
 TOOLS/lib/Parse/Matroska/Utils.pm       |  37 +++
 5 files changed, 1171 insertions(+)
 create mode 100644 TOOLS/lib/Parse/Matroska.pm
 create mode 100644 TOOLS/lib/Parse/Matroska/Definitions.pm
 create mode 100644 TOOLS/lib/Parse/Matroska/Element.pm
 create mode 100644 TOOLS/lib/Parse/Matroska/Reader.pm
 create mode 100644 TOOLS/lib/Parse/Matroska/Utils.pm

(limited to 'TOOLS/lib')
diff --git a/TOOLS/lib/Parse/Matroska.pm b/TOOLS/lib/Parse/Matroska.pm
new file mode 100644
index 0000000000..e1c08c9814
--- /dev/null
+++ b/TOOLS/lib/Parse/Matroska.pm
@@ -0,0 +1,30 @@
+use 5.008;
+use strict;
+use warnings;
+
+# ABSTRACT: Module collection to parse Matroska files.
+package Parse::Matroska;
+
+=head1 DESCRIPTION
+
+C<use>s L<Parse::Matroska::Reader>. See the documentation
+of the modules mentioned in L</"SEE ALSO"> for more information
+on how to use this module.
+
+It's intended for this module to contain high-level interfaces
+to the other modules in the distribution.
+
+=head1 SOURCE CODE
+
+L<https://github.com/Kovensky/Parse-Matroska>
+
+=head1 SEE ALSO
+
+L<Parse::Matroska::Reader>, L<Parse::Matroska::Element>,
+L<Parse::Matroska::Definitions>.
+
+=cut
+
+use Parse::Matroska::Reader;
+
+1;
diff --git a/TOOLS/lib/Parse/Matroska/Definitions.pm b/TOOLS/lib/Parse/Matroska/Definitions.pm
new file mode 100644
index 0000000000..9b700a7d20
--- /dev/null
+++ b/TOOLS/lib/Parse/Matroska/Definitions.pm
@@ -0,0 +1,350 @@
+use 5.008;
+use strict;
+use warnings;
+
+# ABSTRACT: internal EBML grammar definitions
+package Parse::Matroska::Definitions;
+
+use Parse::Matroska::Utils qw{uniq uncamelize};
+
+use Exporter;
+our @ISA       = qw{Exporter};
+our @EXPORT_OK = qw{elem_by_hexid %EBML_DEFINITION %MATROSKA_DEFINITION};
+
+=head1 SYNOPSIS
+
+    use Parse::Matroska::Definitions qw{elem_by_hexid};
+    my $ebml_id = elem_by_hexid('1a45dfa3');
+    print "EBML ID $ebml_id->{elid}'s name: $ebml_id->{name}";
+
+=head1 DESCRIPTION
+
+Contains the definition of the EBML grammar as expected in
+Matroska files. This module is meant mostly for internal use.
+
+As this was extended from a script in mpv-player, some data
+generated is apparently useless for regular module users
+but is still relevant to the mpv-player script. Such data
+is annotated as being for mpv compatibility.
+
+=head1 NOTE
+
+The API of this module is not yet considered stable.
+
+=head1 GLOBALS
+
+These global variables are considered B<immutable>.
+
+=head2 @Parse::Matroska::Definitions::global_elem_list
+
+A global list of known matroska elements. Useful for
+mpv's matroska script, used for generating C headers
+that parse matroska.
+
+=head2 %Parse::Matroska::Definitions::global_elem_dict
+
+A global hash of known matroska elements. Used internally
+by L</elem_by_hexid($id)>.
+
+=cut
+
+@Parse::Matroska::Definitions::global_elem_list = (); # elem() appends every definition here
+%Parse::Matroska::Definitions::global_elem_dict = (); # elem() stores definitions by hex ID here
+
+=head2 %EBML_DEFINITION
+
+Optionally-importable hash of known EBML IDs belonging
+to the EBML generic grammar.
+
+=head2 %MATROSKA_DEFINITION
+
+Optionally-importable hash of known EBML IDs belonging
+to the Matroska-specific grammar.
+
+=cut
+
+our %EBML_DEFINITION = define_ebml(); # (elid => definition) pairs returned by elem()
+our %MATROSKA_DEFINITION = define_matroska(); # same shape, Matroska-specific grammar
+
+=method elem_by_hexid($id)
+
+Returns an EBML Element Definition corresponding to the provided
+hexadecimal string. Returns C<undef> if the element is unknown.
+
+=cut
+sub elem_by_hexid {
+    my $known = \%Parse::Matroska::Definitions::global_elem_dict;
+    return $known->{ $_[0] }; # undef when no element was defined under this ID
+}
+
+################################################
+### Helper functions for document definition ###
+################################################
+
+# used by elem when setting the 'valname' key: EBML value type => C type name
+use constant TYPE_MAP => {
+    uint    => 'uint64_t',
+    str     => 'struct bstr',
+    binary  => 'struct bstr',
+    ebml_id => 'uint32_t',
+    float   => 'double',
+    sint    => 'int64_t',
+};
+
+# this will be localized to "MATROSKA" or "EBML" on the elem declarations
+our $ELEM_DEFINE_TYPE = undef; # elem() uses it as the prefix of 'definename'
+
+=method elem($name,$elid,$valtype)
+
+NOTE: never call this function yourself; it changes data structures
+that are considered immutable outside of this package.
+
+Internal API function that generates the EBML Element Definitions.
+
+This API function returns a list whose first element is C<$elid>
+and the second is a generated hash. The generated hash is stored
+in the @global_elem_list and %global_elem_dict.
+
+The generated hash contains:
+
+=for :list
+= name
+The EBML Element's name, given through C<$name>.
+= elid
+The EBML Element's hex id, given through C<$elid>. Used for lookups by L</elem_by_hexid($id)>.
+= valtype
+The EBML Element's type, given through C<$valtype>, except when C<$valtype> is a hashref.
+= multiple
+If C<$name> ends with a C<*>, this is set as true and strips the C<*> from L</name>. Used to
+mark elements that may be repeated.
+= subelements
+A hashref of elements that may be children of this element, given through C<$valtype> if it
+is a hashref. Sets L</valtype> to C<sub> if there are subelements.
+= subids
+An arrayref listing all the L</elid>s of subelements, C<uniq>ified.
+
+The following elements are for mpv compatibility:
+
+=for :list
+= definename
+Name used for generating C #defines.
+= fieldname
+Name used for generating C struct fields.
+= structname
+Name used for generating C struct names.
+= ebmltype
+A pre-#defined constant to describe the element's type.
+= valname
+Typename used when declaring a struct field referring to this element.
+
+=cut
+sub elem {
+    my ($name, $elid, $valtype) = @_;
+    my $def = { name => $name, elid => $elid, valtype => $valtype };
+    # a trailing '*' marks a repeatable element: strip it and remember it
+    $def->{multiple} = ($def->{name} =~ s/\*$//);
+
+    # mpv compatibility names; ELEM_DEFINE_TYPE is either MATROSKA or EBML
+    $def->{definename} = $ELEM_DEFINE_TYPE . '_ID_' . uc($def->{name});
+    $def->{fieldname}  = uncamelize($def->{name});
+    $def->{structname} = 'ebml_' . $def->{fieldname};
+
+    if (ref $valtype ne 'HASH') {
+        $def->{ebmltype} = 'EBML_TYPE_' . uc($valtype);
+        $def->{valname}  = TYPE_MAP->{$valtype};
+        die "Unrecognized value type $valtype" unless defined $def->{valname};
+    } else {
+        # container element: $valtype holds the child definitions
+        # NOTE(review): uniq runs in scalar context here - confirm its result
+        $def->{subelements} = $valtype;
+        $def->{subids} = uniq map { $_->{elid} } values %$valtype;
+        @$def{qw{valtype ebmltype}} = ('sub', 'EBML_TYPE_SUBELEMENTS');
+        $def->{valname} = "struct $def->{structname}";
+    }
+    push @Parse::Matroska::Definitions::global_elem_list, $def;
+    $Parse::Matroska::Definitions::global_elem_dict{$elid} = $def;
+    return ($elid, $def);
+}
+
+##############################################
+### EBML and Matroska document definitions ###
+##############################################
+
+=method define_ebml
+
+Internal function that defines the EBML generic grammar.
+
+Must not be called from outside the package.
+
+=cut
+sub define_ebml { # returns the (elid => definition) pairs produced by elem()
+    local $ELEM_DEFINE_TYPE = 'EBML'; # read by elem() to build each 'definename'
+    return (
+        elem('EBML', '1a45dfa3', { # a hashref third argument lists the children
+            elem('EBMLVersion',        '4286', 'uint'),
+            elem('EBMLReadVersion',    '42f7', 'uint'),
+            elem('EBMLMaxIDLength',    '42f2', 'uint'),
+            elem('EBMLMaxSizeLength',  '42f3', 'uint'),
+            elem('DocType',            '4282', 'str'),
+            elem('DocTypeVersion',     '4287', 'uint'),
+            elem('DocTypeReadVersion', '4285', 'uint'),
+        }),
+
+        elem('CRC32',      'bf', 'binary'),
+        elem('Void',       'ec', 'binary'),
+    );
+}
+
+
+=method define_matroska
+
+Internal function that defines the Matroska-specific EBML grammar.
+
+Must not be called from outside the package.
+
+=cut
+sub define_matroska { # returns the (elid => definition) pairs produced by elem()
+    local $ELEM_DEFINE_TYPE = 'MATROSKA'; # read by elem() to build each 'definename'
+    return (
+        elem('Segment', '18538067', { # a hashref third argument lists the children
+            elem('SeekHead*', '114d9b74', { # trailing '*': element may be repeated
+                elem('Seek*', '4dbb', {
+                    elem('SeekID',       '53ab', 'ebml_id'),
+                    elem('SeekPosition', '53ac', 'uint'),
+                }),
+            }),
+
+            elem('Info*', '1549a966', {
+                elem('SegmentUID',      '73a4', 'binary'),
+                elem('PrevUID',       '3cb923', 'binary'),
+                elem('NextUID',       '3eb923', 'binary'),
+                elem('TimecodeScale', '2ad7b1', 'uint'),
+                elem('DateUTC',         '4461', 'sint'),
+                elem('Title',           '7ba9', 'str'),
+                elem('MuxingApp',       '4d80', 'str'),
+                elem('WritingApp',      '5741', 'str'),
+                elem('Duration',        '4489', 'float'),
+            }),
+
+            elem('Cluster*', '1f43b675', {
+                elem('Timecode', 'e7', 'uint'),
+                elem('BlockGroup*', 'a0', {
+                    elem('Block',           'a1', 'binary'),
+                    elem('BlockDuration',   '9b', 'uint'),
+                    elem('ReferenceBlock*', 'fb', 'sint'),
+                }),
+                elem('SimpleBlock*', 'a3', 'binary'),
+            }),
+
+            elem('Tracks*', '1654ae6b', {
+                elem('TrackEntry*', 'ae', {
+                    elem('TrackNumber',            'd7', 'uint'),
+                    elem('TrackUID',             '73c5', 'uint'),
+                    elem('TrackType',              '83', 'uint'),
+                    elem('FlagEnabled',            'b9', 'uint'),
+                    elem('FlagDefault',            '88', 'uint'),
+                    elem('FlagForced',           '55aa', 'uint'),
+                    elem('FlagLacing',             '9c', 'uint'),
+                    elem('MinCache',             '6de7', 'uint'),
+                    elem('MaxCache',             '6df8', 'uint'),
+                    elem('DefaultDuration',    '23e383', 'uint'),
+                    elem('TrackTimecodeScale', '23314f', 'float'),
+                    elem('MaxBlockAdditionID',   '55ee', 'uint'),
+                    elem('Name',                 '536e', 'str'),
+                    elem('Language',           '22b59c', 'str'),
+                    elem('CodecID',                '86', 'str'),
+                    elem('CodecPrivate',         '63a2', 'binary'),
+                    elem('CodecName',          '258688', 'str'),
+                    elem('CodecDecodeAll',         'aa', 'uint'),
+                    elem('Video', 'e0', {
+                        elem('FlagInterlaced',  '9a', 'uint'),
+                        elem('PixelWidth',      'b0', 'uint'),
+                        elem('PixelHeight',     'ba', 'uint'),
+                        elem('DisplayWidth',  '54b0', 'uint'),
+                        elem('DisplayHeight', '54ba', 'uint'),
+                        elem('DisplayUnit',   '54b2', 'uint'),
+                        elem('FrameRate',   '2383e3', 'float'),
+                    }),
+                    elem('Audio', 'e1', {
+                        elem('SamplingFrequency',         'b5', 'float'),
+                        elem('OutputSamplingFrequency', '78b5', 'float'),
+                        elem('Channels',                  '9f', 'uint'),
+                        elem('BitDepth',                '6264', 'uint'),
+                    }),
+                    elem('ContentEncodings', '6d80', {
+                        elem('ContentEncoding*', '6240', {
+                            elem('ContentEncodingOrder', '5031', 'uint'),
+                            elem('ContentEncodingScope', '5032', 'uint'),
+                            elem('ContentEncodingType',  '5033', 'uint'),
+                            elem('ContentCompression', '5034', {
+                                elem('ContentCompAlgo',     '4254', 'uint'),
+                                elem('ContentCompSettings', '4255', 'binary'),
+                            }),
+                        }),
+                    }),
+                }),
+            }),
+
+            elem('Cues', '1c53bb6b', {
+                elem('CuePoint*', 'bb', {
+                    elem('CueTime', 'b3', 'uint'),
+                    elem('CueTrackPositions*', 'b7', {
+                        elem('CueTrack',           'f7', 'uint'),
+                        elem('CueClusterPosition', 'f1', 'uint'),
+                    }),
+                }),
+            }),
+
+            elem('Attachments', '1941a469', {
+                elem('AttachedFile*', '61a7', {
+                    elem('FileDescription', '467e', 'str'),
+                    elem('FileName',        '466e', 'str'),
+                    elem('FileMimeType',    '4660', 'str'),
+                    elem('FileData',        '465c', 'binary'),
+                    elem('FileUID',         '46ae', 'uint'),
+                }),
+            }),
+
+            elem('Chapters', '1043a770', {
+                elem('EditionEntry*', '45b9', {
+                    elem('EditionUID',         '45bc', 'uint'),
+                    elem('EditionFlagHidden',  '45bd', 'uint'),
+                    elem('EditionFlagDefault', '45db', 'uint'),
+                    elem('EditionFlagOrdered', '45dd', 'uint'),
+                    elem('ChapterAtom*', 'b6', {
+                        elem('ChapterUID',               '73c4', 'uint'),
+                        elem('ChapterTimeStart',           '91', 'uint'),
+                        elem('ChapterTimeEnd',             '92', 'uint'),
+                        elem('ChapterFlagHidden',          '98', 'uint'),
+                        elem('ChapterFlagEnabled',       '4598', 'uint'),
+                        elem('ChapterSegmentUID',        '6e67', 'binary'),
+                        elem('ChapterSegmentEditionUID', '6ebc', 'uint'),
+                        elem('ChapterDisplay*', '80', {
+                            elem('ChapString',      '85', 'str'),
+                            elem('ChapLanguage*', '437c', 'str'),
+                            elem('ChapCountry*',  '437e', 'str'),
+                        }),
+                    }),
+                }),
+            }),
+            elem('Tags*', '1254c367', {
+                elem('Tag*', '7373', {
+                    elem('Targets', '63c0', {
+                        elem('TargetTypeValue',     '68ca', 'uint'),
+                        elem('TargetTrackUID',      '63c5', 'uint'),
+                        elem('TargetEditionUID',    '63c9', 'uint'),
+                        elem('TargetChapterUID',    '63c4', 'uint'),
+                        elem('TargetAttachmentUID', '63c6', 'uint'),
+                     }),
+                    elem('SimpleTag*', '67c8', {
+                        elem('TagName',     '45a3', 'str'),
+                        elem('TagLanguage', '447a', 'str'),
+                        elem('TagString',   '4487', 'str'),
+                    }),
+                }),
+            }),
+        }),
+    );
+}
+
+1;
diff --git a/TOOLS/lib/Parse/Matroska/Element.pm b/TOOLS/lib/Parse/Matroska/Element.pm
new file mode 100644
index 0000000000..fa0830c11e
--- /dev/null
+++ b/TOOLS/lib/Parse/Matroska/Element.pm
@@ -0,0 +1,331 @@
+use 5.008;
+use strict;
+use warnings;
+
+# ABSTRACT: a mid-level representation of an EBML element
+package Parse::Matroska::Element;
+
+use Carp;
+use List::Util qw{first};
+
+=head1 SYNOPSIS
+
+    use Parse::Matroska::Reader;
+    my $reader = Parse::Matroska::Reader->new($path);
+    my $elem = $reader->read_element;
+
+    print "ID: $elem->{elid}\n";
+    print "Name: $elem->{name}\n";
+    print "Length: $elem->{content_len}\n";
+    print "Type: $elem->{type}\n";
+    print "Child count: ", scalar(@{$elem->all_children}), "\n";
+    if ($elem->{type} eq 'sub') {
+        while (my $chld = $elem->next_child) {
+            print "Child Name: $chld->{name}\n";
+        }
+    } else {
+        print "Value: ", $elem->get_value, "\n";
+    }
+
+=head1 DESCRIPTION
+
+Represents a single Matroska element as decoded by
+L<Parse::Matroska::Reader>. This is essentially a hash
+augmented with functions for delay-loading of binary
+values and children elements.
+
+=head1 NOTE
+
+The API of this module is not yet considered stable.
+
+=attr elid
+
+The EBML Element ID, suitable for passing to
+L<Parse::Matroska::Definitions/elem_by_hexid>.
+
+=attr name
+
+The EBML Element's name.
+
+=attr type
+
+The EBML Element's type. Can be C<uint>, C<sint>, C<float>,
+C<ebml_id>, C<str>, C<binary> or C<sub>. See L</value>
+for details.
+
+Equivalent to
+C<elem_by_hexid($elem-E<gt>{elid})-E<gt>{valtype}>.
+
+=attr value
+
+The EBML Element's value. Should be obtained through
+L</get_value>.
+
+Is a Unicode string if the L</type> is C<str>, that is,
+the string has already been decoded by L<Encode/decode>.
+
+Is C<undef> if the L</type> is C<binary> and the contents
+were delay-loaded and not yet read. L</get_value> will
+do the delayed load if needed.
+
+Is an arrayref if the L</type> is C<sub>, containing
+the children nodes that were already loaded.
+
+Is a hashref if the L</type> is C<ebml_id>, containing
+the referred element's information as defined in
+L<Parse::Matroska::Definitions>. Calling
+C<elem_by_hexid($elem-E<gt>{value}-E<gt>{elid})> will
+return the same object as $elem->{value}.
+
+=attr full_len
+
+The entire length of this EBML Element, including
+the header's.
+
+=attr size_len
+
+The length of the size marker. Used when calculating
+L</full_len> from L</content_len>
+
+=attr content_len
+
+The length of the contents of this EBML Element,
+which excludes the header.
+
+=attr reader
+
+A weakened reference to the associated
+L<Parse::Matroska::Reader>.
+
+=method new(%hash)
+
+Creates a new Element initialized with the hash
+given as argument.
+
+=cut
+sub new {
+    # the first argument is the class; shifting it off leaves only the
+    # attribute key/value pairs in @_
+    my $self = bless {}, shift;
+
+    $self->initialize(@_);
+    return $self;
+}
+
+=method initialize(%hash)
+
+Called by L</new> on initialization.
+
+=cut
+sub initialize {
+    my ($self, %args) = @_;
+    # copy every given attribute into the object, replacing existing keys
+    @$self{keys %args} = values %args;
+    # a missing (or otherwise false) depth becomes 0
+    $self->{depth} ||= 0;
+}
+
+=method skip
+
+Called by the user to ignore the contents of this EBML node.
+Needed when ignoring the children of a node.
+
+=cut
+sub skip {
+    my $self = shift;
+    my $reader = $self->{reader} or return; # nothing to skip without a reader
+    # skipping only works while the reader still sits at the data start
+    if ($reader->getpos ne $self->{data_pos}) {
+        croak "Too late to skip, reads were already done";
+    }
+    $reader->skip($self->{content_len});
+}
+
+=method get_value($keep_bin)
+
+Returns the value contained by this EBML element.
+
+If the element has children, returns an arrayref to
+the children elements that were already encountered.
+
+If the element's type is C<binary> and the value was
+delay-loaded, does the reading now.
+
+If $keep_bin is true, the delay-loaded data is kept
+as the L</value>, otherwise, further calls to
+C<get_value> will reread the data from the L</reader>.
+
+=cut
+sub get_value {
+    my ($self, $keep_bin) = @_;
+
+    return undef if $self->{type} eq 'skip';
+    # test definedness, not truth: 0, '' and a zero Math::BigInt are
+    # valid decoded values and must not fall through to the reader
+    return $self->{value} if defined $self->{value};
+
+    my $reader = $self->{reader} or
+        croak "The associated Reader has been deleted";
+
+    # only delay-loaded 'binary' contents can still be fetched at this point
+    return undef unless $self->{type} eq 'binary';
+
+    croak "Cannot seek in the current Reader" unless $self->{data_pos};
+    # seek to the data position...
+    $reader->setpos($self->{data_pos});
+    # ...and read the data, keeping it in value only if requested
+    my $data = $reader->readlen($self->{content_len});
+    $self->{value} = $data if $keep_bin;
+
+    return $data;
+}
+
+=method next_child($read_bin)
+
+Builtin iterator; reads and returns the next child element.
+Always returns undef if the type isn't C<sub>.
+
+Returns undef at the end of the iterator and resets itself to
+point to the first element; so calling L</next_child($read_bin)>
+after the iterator returned C<undef> will return the first child.
+
+The optional C<$read_bin> parameter has the children elements
+not delay-load their value if their type is C<binary>.
+
+If all children elements have already been read, return
+each element in-order as would be given by
+L</all_children($recurse,$read_bin)>.
+
+=cut
+sub next_child {
+    my ($self, $read_bin) = @_;
+    return unless $self->{type} eq 'sub';
+
+    if ($self->{_all_children_read}) {
+        # a childless element may have no list yet; @{undef} would die
+        my $children = $self->{value} ||= [];
+        my $idx = $self->{_last_child} ||= 0;
+        if ($idx == @$children) {
+            # reset the iterator, returning undef once
+            $self->{_last_child} = 0;
+            return;
+        }
+        $self->{_last_child} = $idx + 1;
+        return $children->[$idx];
+    }
+
+    # bytes of this element's content not yet claimed by a child
+    my $len = defined $self->{remaining_len}
+        ? $self->{remaining_len}
+        : $self->{content_len};
+
+    if ($len == 0) {
+        # we've read all children; switch into $self->{value} iteration mode
+        $self->{_all_children_read} = 1;
+        # return undef since the iterator will reset
+        return;
+    }
+
+    # go back to the data start, then past the children read so far
+    $self->{pos_offset} ||= 0;
+    my $pos = $self->{data_pos};
+    my $reader = $self->{reader} or croak "The associated reader has been deleted";
+    $reader->setpos($pos);
+    $reader->{fh}->seek($self->{pos_offset}, 1) if $pos;
+
+    my $chld = $reader->read_element($read_bin);
+    return undef unless defined $chld;
+    $self->{pos_offset} += $chld->{full_len};
+    $self->{remaining_len} = $len - $chld->{full_len};
+
+    if ($self->{remaining_len} < 0) {
+        # remaining_len is negative here; report the overrun as a positive count
+        my $excess = -$self->{remaining_len};
+        croak "Child elements consumed $excess more bytes than parent $self->{name} contained";
+    }
+
+    $chld->{depth} = $self->{depth} + 1;
+    push @{ $self->{value} ||= [] }, $chld;
+
+    return $chld;
+}
+
+=method all_children($recurse,$read_bin)
+
+Calls L</populate_children($recurse,$read_bin)> on self
+and returns an arrayref with the children nodes.
+
+Both C<$recurse> and C<$read_bin> are optional and default
+to false.
+
+=cut
+sub all_children {
+    my ($node, $recurse, $read_bin) = @_;
+    $node->populate_children($recurse, $read_bin); # both flags passed through
+    return $node->{value};
+}
+
+=method children_by_name($name)
+
+Searches in the already read children elements for all
+elements with the EBML name C<$name>. Returns an array
+containing all found elements. On scalar context,
+returns only the first element found.
+
+Croaks if the element's C<type> isn't C<sub>.
+
+=cut
+sub children_by_name {
+    my ($self, $name) = @_;
+    my $want = wantarray;
+    return unless defined $want; # don't do work if work isn't wanted
+    croak "Element can't have children" unless $self->{type} eq 'sub';
+    my @found = grep { $_->{name} eq $name } @{$self->{value}};
+    # every match in list context, only the first one in scalar context
+    return $want ? @found : $found[0];
+}
+
+=method populate_children($recurse,$read_bin)
+
+Populates the internal array of children elements, that is,
+requests that the associated L<Parse::Matroska::Reader> reads
+all children elements. Returns itself.
+
+Returns false if the element's C<type> isn't C<sub>.
+
+If C<$recurse> is provided and is true, the method will call
+itself in the children elements with the same parameters it
+received; this will build a full EBML tree.
+
+If C<$read_bin> is provided and is true, disables delay-loading
+of the contents of C<binary>-type nodes, reading the contents
+to memory.
+
+If both C<$recurse> and C<$read_bin> are true, entire EBML trees
+can be loaded without requiring seeks, thus behaving correctly
+on unseekable streams. If C<$read_bin> is false, the entire EBML
+tree is still loaded, but calling L</get_value> on C<binary>-type
+nodes will produce an error on unseekable streams.
+
+=cut
+sub populate_children {
+    my ($self, $recurse, $read_bin) = @_;
+
+    return unless $self->{type} eq 'sub';
+
+    # the child list may not exist yet; dereferencing undef would die
+    my $known = $self->{value} || [];
+    if (@$known && $recurse) {
+        # only recurse
+        $_->populate_children($recurse, $read_bin) for @$known;
+        return $self;
+    }
+
+    while (my $chld = $self->next_child($read_bin)) {
+        $chld->populate_children($recurse, $read_bin) if $recurse;
+    }
+
+    return $self;
+}
+
+1;
diff --git a/TOOLS/lib/Parse/Matroska/Reader.pm b/TOOLS/lib/Parse/Matroska/Reader.pm
new file mode 100644
index 0000000000..47e67ce5f7
--- /dev/null
+++ b/TOOLS/lib/Parse/Matroska/Reader.pm
@@ -0,0 +1,423 @@
+use 5.008;
+use strict;
+use warnings;
+
+# ABSTRACT: a low-level reader for EBML files
+package Parse::Matroska::Reader;
+
+use Parse::Matroska::Definitions qw{elem_by_hexid};
+use Parse::Matroska::Element;
+
+use Carp;
+use Scalar::Util qw{openhandle weaken};
+use IO::Handle;
+use IO::File;
+use List::Util qw{first};
+use Encode;
+
+use constant BIGINT_TRY => 'Pari,GMP,FastCalc';
+use Math::BigInt try => BIGINT_TRY;
+use Math::BigRat try => BIGINT_TRY;
+
+=head1 SYNOPSIS
+
+    use Parse::Matroska::Reader;
+    my $reader = Parse::Matroska::Reader->new($path);
+    $reader->close;
+    $reader->open(\$string_with_matroska_data);
+
+    my $elem = $reader->read_element;
+    print "Element ID: $elem->{elid}\n";
+    print "Element name: $elem->{name}\n";
+    if ($elem->{type} ne 'sub') {
+        print "Element value: ", $elem->get_value, "\n";
+    } else {
+        while (my $child = $elem->next_child) {
+            print "Child element: $child->{name}\n";
+        }
+    }
+    $reader->close;
+
+=head1 DESCRIPTION
+
+Reads EBML data, which is used in Matroska files.
+This is a low-level reader which is meant to be used as a backend
+for higher level readers. TODO: write the high level readers :)
+
+=head1 NOTE
+
+The API of this module is not yet considered stable.
+
+=method new
+
+Creates a new reader.
+Calls L</open($arg)> with its arguments if provided.
+
+=cut
+sub new {
+    # the first argument is the class; whatever is left goes to open()
+    my $self = bless {}, shift;
+    if (@_) {
+        $self->open(@_);
+    }
+    return $self;
+}
+
+=method open($arg)
+
+Creates the internal filehandle. The argument can be:
+
+=for :list
+* An open filehandle or L<IO::Handle> object.
+The filehandle is not C<dup()>ed, so calling L</close> in this
+object will close the given filehandle as well.
+* A scalar containing a path to a file.
+* On perl v5.14 or newer, a scalarref pointing to EBML data.
+For similar functionality in older perls, give an L<IO::String> object
+or the handle to an already C<open>ed scalarref.
+
+=cut
+sub open {
+    my ($self, $arg) = @_;
+    # reuse an already open handle as-is, otherwise open $arg for raw reading
+    my $fh = $self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw");
+    return $fh || croak "Can't open $arg: $!";
+}
+
+=method close
+
+Closes the internal filehandle.
+
+=cut
+sub close {
+    my $self = shift;
+    my $fh = delete $self->{fh}; # forget the handle, keep it for closing
+    $fh->close;
+    return $fh;
+}
+
+# reads one byte; equivalent to $self->readlen(1), possibly faster
+sub _getc {
+    my $byte = $_[0]{fh}->getc;
+    # an undefined result counts as a failure only when $! is set as well
+    return $byte if defined $byte or not $!;
+    croak "Can't do read of length 1: $!";
+}
+
+=method readlen($length)
+
+Reads C<$length> bytes from the internal filehandle.
+
+=cut
+sub readlen {
+    my ($self, $len) = @_;
+    my $buf;
+    # only an undefined result from read() is treated as a failure;
+    # whatever was read (possibly fewer bytes than asked for) is returned
+    defined $self->{fh}->read($buf, $len)
+        or croak "Can't do read of length $len: $!";
+    return $buf;
+}
+
+# converts a big-endian byte string into an integer
+# we do so by converting the byte string into a hex string
+# and then reading the hex string into an integer
+sub _bin2int($) {
+    my ($bin) = @_;
+    my $hex = unpack("H*", $bin);
+    # four or more bytes might not fit a native integer (> INT_MAX), so
+    # those go to Math::BigInt; the explicit "0x" prefix keeps from_hex()
+    # working on Math::BigInt releases that reject unprefixed input
+    return Math::BigInt->from_hex("0x$hex") if length($bin) > 3;
+    return hex($hex);
+}
+
+# scales the given mantissa by two raised to the given exponent
+sub _ldexp {
+    my $scale = Math::BigRat->new(2) ** $_[1]; # 2**exponent as a Math::BigRat
+    return $_[0] * $scale;
+}
+
+# NOTE: the read_* functions are hard to read because they're ports
+# of even harder to read python functions.
+# TODO: make them readable
+
+=method read_id
+
+Reads an EBML ID atom in hexadecimal string format, suitable
+for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>.
+
+=cut
+sub read_id {
+    my ($self) = @_;
+    my $first = $self->_getc;
+    return undef unless defined $first;
+
+    my $lead = ord $first;
+    croak "Matroska Syntax error: first byte of ID was \\0" if $lead == 0;
+
+    # every zero bit in front of the first set bit stands for one more
+    # byte that belongs to the ID
+    my $extra = 0;
+    ++$extra until $lead & (0x80 >> $extra);
+
+    # return hex string of the bytes we just read
+    my $raw = $first . $self->readlen($extra);
+    return unpack "H*", $raw;
+}
+
+=method read_size
+
+Reads an EBML Data Size atom, which immediately follows
+an EBML ID atom.
+
+This returns an array consisting of:
+
+=for :list
+0. The length of the Data Size atom.
+1. The value encoded in the Data Size atom, which is the length of all the data following it.
+
+=cut
+sub read_size {
+    my ($self) = @_;
+    my $t = $self->_getc;
+    # undef means no byte was available: the data ends where a size is due
+    croak "Matroska Syntax error: unexpected end of data in data size"
+        unless defined $t;
+    croak "Matroska Syntax error: first byte of data size was \\0"
+        if ord($t) == 0;
+
+    # each leading zero bit announces one more size byte after the first
+    my ($i, $mask) = (0, 1<<7);
+    ($i, $mask) = ($i + 1, $mask >> 1) until ord($t) & $mask;
+
+    $t = $t & chr($mask-1); # strip length bits (keep only significant bits)
+    return ($i+1, _bin2int($t . $self->readlen($i)));
+}
+
+=method read_str($length)
+
+Reads a string of length C<$length> bytes from the internal filehandle.
+The string is already L<Encode/decode>d from C<UTF-8>, which is the
+standard Matroska string encoding.
+
+=cut
+{
+    my $decoder = find_encoding("UTF-8"); # looked up once, shared by all calls
+    sub read_str {
+        my ($reader, $length) = @_;
+        return $decoder->decode($reader->readlen($length));
+    }
+}
+
+=method read_uint($length)
+
+Reads an unsigned integer of length C<$length> bytes
+from the internal filehandle.
+
+Returns a L<Math::BigInt> object if C<$length> is greater
+than 3.
+
+=cut
+sub read_uint {
+    my ($reader, $length) = @_;
+    return _bin2int($reader->readlen($length)); # bytes are read big-endian
+}
+
+=method read_sint($length)
+
+Reads a signed integer of length C<$length> bytes
+from the internal filehandle.
+
+Returns a L<Math::BigInt> object if C<$length> is greater
+than 3.
+
+=cut
+sub read_sint {
+    my ($self, $length) = @_;
+    my $i = $self->read_uint($length);
+    return $i unless $length > 0; # no bytes, no sign bit
+
+    # Apply 2's complement to the unsigned int. The sign bit is built with
+    # integer arithmetic only: doubling a native 2**63 overflows into a
+    # float, which corrupted negative 8-byte values.
+    my $bits = $length * 8 - 1;
+    my $msb = ref $i ? Math::BigInt->new(1)->blsft($bits) : 1 << $bits;
+    $i -= 2 * $msb if $i & $msb; # MSB set: subtract it twice
+    return $i;
+}
+
+=method read_float($length)
+
+Reads an IEEE floating point number of length C<$length>
+bytes from the internal filehandle.
+
+Only lengths C<4> and C<8> are supported (C C<float> and C<double>).
+
+=cut
+sub read_float {
+    my ($self, $length) = @_;
+    my $i = $self->read_uint($length);
+
+    use bigrat try => BIGINT_TRY;
+
+    # field widths of IEEE 754 binary32 / binary64: (fraction, exponent) bits
+    my ($frac, $ebits) = $length == 4 ? (23, 8) : $length == 8 ? (52, 11)
+        : croak "Matroska Syntax error: unsupported IEEE float byte size $length";
+
+    # `-` binds tighter than `<<`, so each (1<<n) - 1 mask needs its parens
+    my $mantissa = $i & ((1<<$frac) - 1);
+    my $exponent = $i>>$frac & ((1<<$ebits) - 1);
+    # exponent field 0 is zero/subnormal: no implicit leading 1, scaled as 1
+    $mantissa += (1<<$frac) if $exponent;
+    my $bias = (1<<($ebits - 1)) - 1 + $frac; # 150 for 4 bytes, 1075 for 8
+    my $f = _ldexp($mantissa, ($exponent || 1) - $bias);
+    # NOTE: infinities and NaNs (all-ones exponent) are not special-cased
+    return $i & (1<<($length * 8 - 1)) ? -$f : $f;
+}
+
+=method read_ebml_id($length)
+
+Reads an EBML ID when it's encoded as the data inside another
+EBML element, that is, when the enclosing element's C<type> is
+C<ebml_id>.
+
+This returns a hashref with the EBML element description as
+defined in L<Parse::Matroska::Definitions>.
+
+=cut
+sub read_ebml_id {
+    my ($reader, $nbytes) = @_;   # $nbytes: how many bytes the ID occupies
+    return elem_by_hexid(unpack 'H*', $reader->readlen($nbytes));
+}
+
+=method skip($length)
+
+Skips C<$length> bytes in the internal filehandle.
+
+=cut
+sub skip {
+    my ($self, $len) = @_;
+    my $fh = $self->{fh};   # seek from the current position, else read+discard
+    $self->readlen($len) unless $fh->can('seek') && $fh->seek($len, 1);
+    return;
+}
+
+=method getpos
+
+Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle.
+
+Returns undef if the internal filehandle can't C<getpos>.
+
+=cut
+sub getpos {
+    my $fh = $_[0]{fh};
+    # an explicit undef (not an empty list) when the handle cannot getpos
+    return $fh->can('getpos') ? $fh->getpos : undef;
+}
+
+=method setpos($pos)
+
+Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle.
+
+Returns C<undef> if the internal filehandle can't C<setpos>.
+
+Croaks if C<setpos> does not seek to the requested position,
+that is, if calling C<getpos> does not yield the same object
+as the C<$pos> argument.
+
+=cut
+sub setpos {
+    my ($self, $pos) = @_;
+    my $fh = $self->{fh};
+    return undef if !$pos || !$fh->can('setpos');
+    my $status = $fh->setpos($pos);
+    # verify the move: the position reported now must equal the request
+    $self->getpos eq $pos or croak "Cannot seek to correct position";
+    return $status;
+}
+
+=method read_element($read_bin)
+
+Reads a full EBML element from the internal filehandle.
+
+Returns a L<Parse::Matroska::Element> object initialized with
+the read data. If C<$read_bin> is not present or is false, the
+contents of C<binary>-type elements are delay-loaded, that is,
+they will only be loaded when calling C<get_value> on the
+returned L<Parse::Matroska::Element> object.
+
+Does not read the children of the element if its type is
+C<sub>. Look into the L<Parse::Matroska::Element> interface
+for details on how to read child elements.
+
+Pass a true C<$read_bin> if the stream being read is not
+seekable (C<getpos> is undef) and the contents of C<binary>
+elements are desired; otherwise seeking errors or internal
+filehandle corruption might occur.
+
+=cut
+sub read_element {
+    my ($self, $read_bin) = @_;
+    return undef if $self->{fh}->eof;   # nothing left to read
+
+    my $elem_pos = $self->getpos;       # where the element's ID starts
+    my $elid = $self->read_id;
+    my $elem_def = elem_by_hexid($elid);   # may be undef: handled below
+    my ($size_len, $content_len) = $self->read_size;
+    # $elid is a hex string: two characters per byte of the encoded ID
+    my $full_len = length($elid)/2 + $size_len + $content_len;
+
+    my $elem = Parse::Matroska::Element->new(
+        elid => $elid,
+        name => $elem_def && $elem_def->{name},
+        type => $elem_def && $elem_def->{valtype},
+        size_len => $size_len,
+        content_len => $content_len,
+        full_len => $full_len,
+        reader => $self,
+        elem_pos => $elem_pos,
+        data_pos => $self->getpos,
+        );
+    weaken($elem->{reader});   # weak ref: the element doesn't keep the reader alive
+
+    if (defined $elem_def) {
+        if ($elem->{type} eq 'sub') {
+            $elem->{value} = [];   # children are not read here
+        } elsif ($elem->{type} eq 'str') {
+            $elem->{value} = $self->read_str($content_len);
+        } elsif ($elem->{type} eq 'ebml_id') {
+            $elem->{value} = $self->read_ebml_id($content_len);
+        } elsif ($elem->{type} eq 'uint') {
+            $elem->{value} = $self->read_uint($content_len);
+        } elsif ($elem->{type} eq 'sint') {
+            $elem->{value} = $self->read_sint($content_len);
+        } elsif ($elem->{type} eq 'float') {
+            $elem->{value} = $self->read_float($content_len);
+        } elsif ($elem->{type} eq 'skip') {
+            $self->skip($content_len);
+        } elsif ($elem->{type} eq 'binary') {
+            if ($read_bin) {
+                $elem->{value} = $self->readlen($content_len);
+            } else {
+                $self->skip($content_len);   # payload not loaded now
+            }
+        } else {   # the definition names a value type not handled above
+            die "Matroska Definition error: type $elem->{type} unknown"
+        }
+    } else {
+        $self->skip($content_len);   # no definition for this ID: skip content
+    }
+    return $elem;
+}
+
+1;
+
+=head1 CAVEATS
+
+Child elements have to be processed as soon as an element
+with children is found, or be skipped with
+L<Parse::Matroska::Element/skip>. Not doing so doesn't cause
+errors but results in an invalid structure, with constant C<0>
+depth.
+
+To work correctly in unseekable streams, either the contents
+of C<binary>-type elements have to be ignored or the C<read_bin>
+flag to C<read_element> has to be true.
diff --git a/TOOLS/lib/Parse/Matroska/Utils.pm b/TOOLS/lib/Parse/Matroska/Utils.pm
new file mode 100644
index 0000000000..127d626cb1
--- /dev/null
+++ b/TOOLS/lib/Parse/Matroska/Utils.pm
@@ -0,0 +1,37 @@
+use strict;
+use warnings;
+
+# ABSTRACT: internally-used helper functions
+package Parse::Matroska::Utils;
+
+use Exporter;
+our @ISA       = qw{Exporter};
+our @EXPORT_OK = qw{uniq uncamelize};
+
+=method uniq(@array)
+
+The same as L<List::MoreUtils/"uniq LIST">.
+Included to avoid depending on it since it's
+not a core module.
+
+=cut
+sub uniq(@) {
+  my %times_seen;
+  return grep { ++$times_seen{$_} == 1 } @_;   # keep only first occurrences
+}
+
+=method uncamelize($string)
+
+Converts a "StringLikeTHIS" into a
+"string_like_this".
+
+=cut
+sub uncamelize($) {
+    # a lowercase letter directly followed by an uppercase one starts a word
+    (my $name = shift) =~ s/(?<=[a-z])([A-Z])/_\L$1/g;
+    # so does an uppercase letter that is followed by two lowercase ones
+    $name =~ s/([A-Z])(?=[a-z]{2})/_\L$1/g;
+    # the second pass may leave a leading "_": drop it, then lowercase all
+    $name =~ s/^_//;
+    return lc $name;
+}
-- 
cgit v1.2.3