From ff9f5e06ff203c055d968087956026ef9204218b Mon Sep 17 00:00:00 2001 From: wm4 Date: Sat, 17 Dec 2016 13:24:05 +0100 Subject: Revert "Port several python scripts to Perl" This reverts commit fae73079310eef9dce9737f2e37ff4b80c8830ee. Before the waf build system was used, we had a configure script written in shell. To drop the build dependency on Python, someone rewrote the Python scripts we had to Perl. Now the shell configure script is gone, and it makes no sense to have a build dependency on both Perl and Python. This isn't just a straight revert. It adds the new Matroska EBML elements to the old Python scripts, adjusts the waf build system, and of course doesn't add anything back needed by the old build system. It would be better if this used matroska.py/file2string.py directly by importing them as modules, instead of calling them via "python". But for now this is simpler. --- TOOLS/lib/Parse/Matroska.pm | 30 --- TOOLS/lib/Parse/Matroska/Definitions.pm | 384 ---------------------------- TOOLS/lib/Parse/Matroska/Element.pm | 331 ------------------------- TOOLS/lib/Parse/Matroska/Reader.pm | 426 -------------------------------- TOOLS/lib/Parse/Matroska/Utils.pm | 37 --- 5 files changed, 1208 deletions(-) delete mode 100644 TOOLS/lib/Parse/Matroska.pm delete mode 100644 TOOLS/lib/Parse/Matroska/Definitions.pm delete mode 100644 TOOLS/lib/Parse/Matroska/Element.pm delete mode 100644 TOOLS/lib/Parse/Matroska/Reader.pm delete mode 100644 TOOLS/lib/Parse/Matroska/Utils.pm (limited to 'TOOLS/lib/Parse') diff --git a/TOOLS/lib/Parse/Matroska.pm b/TOOLS/lib/Parse/Matroska.pm deleted file mode 100644 index e1c08c9814..0000000000 --- a/TOOLS/lib/Parse/Matroska.pm +++ /dev/null @@ -1,30 +0,0 @@ -use 5.008; -use strict; -use warnings; - -# ABSTRACT: Module collection to parse Matroska files. -package Parse::Matroska; - -=head1 DESCRIPTION - -Cs L. See the documentation -of the modules mentioned in L for more information -in how to use this module. - -It's intended for this module to contain high-level interfaces -to the other modules in the distribution. - -=head1 SOURCE CODE - -L - -=head1 SEE ALSO - -L, L, -L. - -=cut - -use Parse::Matroska::Reader; - -1; diff --git a/TOOLS/lib/Parse/Matroska/Definitions.pm b/TOOLS/lib/Parse/Matroska/Definitions.pm deleted file mode 100644 index 5a5adcd6de..0000000000 --- a/TOOLS/lib/Parse/Matroska/Definitions.pm +++ /dev/null @@ -1,384 +0,0 @@ -use 5.008; -use strict; -use warnings; - -# ABSTRACT: internal EBML grammar definitions -package Parse::Matroska::Definitions; - -use Parse::Matroska::Utils qw{uniq uncamelize}; - -use Exporter; -our @ISA = qw{Exporter}; -our @EXPORT_OK = qw{elem_by_hexid %EBML_DEFINITION %MATROSKA_DEFINITION}; - -=head1 SYNOPSIS - - use Parse::Matroska::Definitions qw{elem_by_hexid}; - my $ebml_id = elem_by_hexid('1a45dfa3'); - print "EBML ID $ebml_id->{elid}'s name: $ebml_id->{name}"; - -=head1 DESCRIPTION - -Contains the definition of the EBML grammar as expected in -Matroska files. This module is meant mostly for internal use. - -As this was extended from a script in mpv-player, some data -generated is apparently useless for regular module users -but is still relevant to the mpv-player script. Such data -is annotated as being for mpv compatibility. - -=head1 NOTE - -The API of this module is not yet considered stable. - -=head1 GLOBALS - -These global variables are considered B. - -=head2 @Parse::Matroska::Definitions::global_elem_list - -A global list of known matroska elements. Useful for -mpv's matroska script, used for generating C headers -that parse matroska. - -=head2 %Parse::Matroska::Definitions::global_elem_dict - -A global hash of known matroska elements. Used internally -by L. - -=cut - -@Parse::Matroska::Definitions::global_elem_list = (); -%Parse::Matroska::Definitions::global_elem_dict = (); - -=head2 %EBML_DEFINITION - -Optionally-importable hash of known EBML IDs belonging -to the EBML generic grammar. - -=head2 %MATROSKA_DEFINITION - -Optionally-importable hash of known EBML IDs belonging -to the Matroska-specific grammar. - -=cut - -our %EBML_DEFINITION = define_ebml(); -our %MATROSKA_DEFINITION = define_matroska(); - -=method elem_by_hexid($id) - -Returns an EBML Element Definition corresponding to the provided -hexadecimal string. Returns C if the element is unknown. - -=cut -sub elem_by_hexid { - my ($elid) = @_; - return $Parse::Matroska::Definitions::global_elem_dict{$elid}; -} - -################################################ -### Helper functions for document definition ### -################################################ - -# used by elem when setting the 'valname' key -use constant TYPE_MAP => { - uint => 'uint64_t', - str => 'char *', - binary => 'struct bstr', - ebml_id => 'uint32_t', - float => 'double', - sint => 'int64_t', -}; - -# this will be localized to "MATROSKA" or "EBML" on the elem declarations -our $ELEM_DEFINE_TYPE = undef; - -=method elem($name,$elid,$valtype) - -NOTE: never call this function yourself; it changes data structures -that are considered immutable outside of this package. - -Internal API function that generates the EBML Element Definitions. - -This API function returns an array which first element is C<$elid> -and the second is a generated hash. The generated hash is stored -in the @global_elem_list and %global_elem_dict. - -The generated hash contains: - -=for :list -= name -The EBML Element's name, given through C<$name>. -= elid -The EBML Element's hex id, given through C<$elid>. Used for lookups by L. -= valtype -The EBML Element's type, given through C<$valtype>, except when C<$valtype> is an arrayref. -= multiple -If C<$name> ends with a C<*>, this is set as true and strips the C<*> from L. Used to -mark elements that may be repeated. -= subelements -An arrayref of elements that may be children of this element, given through C<$valtype> if it -is an arrayref. Sets L to C if there are subelements. -= subids -An arrayref listing all the Ls of subelements, Cified. - -The following elements are for mpv compatibility: - -=for :list -= definename -Name used for generating C #defines. -= fieldname -Name used for generating C struct fields. -= structname -Name used for generating C struct names. -= ebmltype -A pre-#defined constant to describe the element's type. -= valname -Typename used when declaring a struct field referring to this element. - -=cut -sub elem { - my %e = (name => shift, elid => shift, valtype => shift); - - # strip * from name, set 'multiple' if there was one - $e{multiple} = scalar $e{name} =~ s/\*$//; - - # ELEM_DEFINE_TYPE is either MATROSKA or EBML - $e{definename} = "${ELEM_DEFINE_TYPE}_ID_".uc($e{name}); - $e{fieldname} = uncamelize $e{name}; - $e{structname} = "ebml_$e{fieldname}"; - - if (ref $e{valtype} eq 'HASH') { - $e{subelements} = $e{valtype}; - $e{subids} = uniq map { $_->{elid} } values %{$e{subelements}}; - $e{valtype} = 'sub'; - $e{ebmltype} = 'EBML_TYPE_SUBELEMENTS'; - $e{valname} = "struct $e{structname}"; - } else { - $e{ebmltype} = "EBML_TYPE_\U$e{valtype}"; - die "Unrecognized value type $e{valtype}" unless - defined ($e{valname} = TYPE_MAP->{$e{valtype}}); - } - my $e = \%e; - push @Parse::Matroska::Definitions::global_elem_list, $e; - $Parse::Matroska::Definitions::global_elem_dict{$e{elid}} = $e; - return ($e{elid}, $e); -} - -############################################# -### EBML and Matroska document definitons ### -############################################# - -=method define_ebml - -Internal function that defines the EBML generic grammar. - -Must not be called from outside the package. - -=cut -sub define_ebml { - local $ELEM_DEFINE_TYPE = 'EBML'; - return ( - elem('EBML', '1a45dfa3', { - elem('EBMLVersion', '4286', 'uint'), - elem('EBMLReadVersion', '42f7', 'uint'), - elem('EBMLMaxIDLength', '42f2', 'uint'), - elem('EBMLMaxSizeLength', '42f3', 'uint'), - elem('DocType', '4282', 'str'), - elem('DocTypeVersion', '4287', 'uint'), - elem('DocTypeReadVersion', '4285', 'uint'), - }), - - elem('CRC32', 'bf', 'binary'), - elem('Void', 'ec', 'binary'), - ); -} - - -=method define_matroska - -Internal function that defines the Matroska-specific EBML grammar. - -Must not be called from outside the package. - -=cut -sub define_matroska { - local $ELEM_DEFINE_TYPE = 'MATROSKA'; - return ( - elem('Segment', '18538067', { - elem('SeekHead*', '114d9b74', { - elem('Seek*', '4dbb', { - elem('SeekID', '53ab', 'ebml_id'), - elem('SeekPosition', '53ac', 'uint'), - }), - }), - - elem('Info*', '1549a966', { - elem('SegmentUID', '73a4', 'binary'), - elem('PrevUID', '3cb923', 'binary'), - elem('NextUID', '3eb923', 'binary'), - elem('TimecodeScale', '2ad7b1', 'uint'), - elem('DateUTC', '4461', 'sint'), - elem('Title', '7ba9', 'str'), - elem('MuxingApp', '4d80', 'str'), - elem('WritingApp', '5741', 'str'), - elem('Duration', '4489', 'float'), - }), - - elem('Cluster*', '1f43b675', { - elem('Timecode', 'e7', 'uint'), - elem('BlockGroup*', 'a0', { - elem('Block', 'a1', 'binary'), - elem('BlockDuration', '9b', 'uint'), - elem('ReferenceBlock*', 'fb', 'sint'), - elem('DiscardPadding', '75A2', 'sint'), - }), - elem('SimpleBlock*', 'a3', 'binary'), - }), - - elem('Tracks*', '1654ae6b', { - elem('TrackEntry*', 'ae', { - elem('TrackNumber', 'd7', 'uint'), - elem('TrackUID', '73c5', 'uint'), - elem('TrackType', '83', 'uint'), - elem('FlagEnabled', 'b9', 'uint'), - elem('FlagDefault', '88', 'uint'), - elem('FlagForced', '55aa', 'uint'), - elem('FlagLacing', '9c', 'uint'), - elem('MinCache', '6de7', 'uint'), - elem('MaxCache', '6df8', 'uint'), - elem('DefaultDuration', '23e383', 'uint'), - elem('TrackTimecodeScale', '23314f', 'float'), - elem('MaxBlockAdditionID', '55ee', 'uint'), - elem('Name', '536e', 'str'), - elem('Language', '22b59c', 'str'), - elem('CodecID', '86', 'str'), - elem('CodecPrivate', '63a2', 'binary'), - elem('CodecName', '258688', 'str'), - elem('CodecDecodeAll', 'aa', 'uint'), - elem('CodecDelay', '56AA', 'uint'), - elem('SeekPreRoll', '56BB', 'uint'), - elem('Video', 'e0', { - elem('FlagInterlaced', '9a', 'uint'), - elem('PixelWidth', 'b0', 'uint'), - elem('PixelHeight', 'ba', 'uint'), - elem('DisplayWidth', '54b0', 'uint'), - elem('DisplayHeight', '54ba', 'uint'), - elem('DisplayUnit', '54b2', 'uint'), - elem('FrameRate', '2383e3', 'float'), - elem('ColourSpace', '2eb524', 'binary'), - elem('StereoMode', '53b8', 'uint'), - elem('Colour', '55B0', { - elem('MatrixCoefficients', '55B1', 'uint'), - elem('BitsPerChannel', '55B2', 'uint'), - elem('ChromaSubsamplingHorz', '55B3', 'uint'), - elem('ChromaSubsamplingVert', '55B4', 'uint'), - elem('CbSubsamplingHorz', '55B5', 'uint'), - elem('CbSubsamplingVert', '55B6', 'uint'), - elem('ChromaSitingHorz', '55B7', 'uint'), - elem('ChromaSitingVert', '55B8', 'uint'), - elem('Range', '55B9', 'uint'), - elem('TransferCharacteristics', '55BA', 'uint'), - elem('Primaries', '55BB', 'uint'), - elem('MaxCLL', '55BC', 'uint'), - elem('MaxFALL', '55BD', 'uint'), - elem('MasteringMetadata', '55D0', { - elem('PrimaryRChromaticityX', '55D1', 'float'), - elem('PrimaryRChromaticityY', '55D2', 'float'), - elem('PrimaryGChromaticityX', '55D3', 'float'), - elem('PrimaryGChromaticityY', '55D4', 'float'), - elem('PrimaryBChromaticityX', '55D5', 'float'), - elem('PrimaryBChromaticityY', '55D6', 'float'), - elem('WhitePointChromaticityX', '55D7', 'float'), - elem('WhitePointChromaticityY', '55D8', 'float'), - elem('LuminanceMax', '55D9', 'float'), - elem('LuminanceMin', '55DA', 'float'), - }), - }), - }), - elem('Audio', 'e1', { - elem('SamplingFrequency', 'b5', 'float'), - elem('OutputSamplingFrequency', '78b5', 'float'), - elem('Channels', '9f', 'uint'), - elem('BitDepth', '6264', 'uint'), - }), - elem('ContentEncodings', '6d80', { - elem('ContentEncoding*', '6240', { - elem('ContentEncodingOrder', '5031', 'uint'), - elem('ContentEncodingScope', '5032', 'uint'), - elem('ContentEncodingType', '5033', 'uint'), - elem('ContentCompression', '5034', { - elem('ContentCompAlgo', '4254', 'uint'), - elem('ContentCompSettings', '4255', 'binary'), - }), - }), - }), - }), - }), - - elem('Cues', '1c53bb6b', { - elem('CuePoint*', 'bb', { - elem('CueTime', 'b3', 'uint'), - elem('CueTrackPositions*', 'b7', { - elem('CueTrack', 'f7', 'uint'), - elem('CueClusterPosition', 'f1', 'uint'), - elem('CueRelativePosition','f0', 'uint'), - elem('CueDuration', 'b2', 'uint'), - }), - }), - }), - - elem('Attachments', '1941a469', { - elem('AttachedFile*', '61a7', { - elem('FileDescription', '467e', 'str'), - elem('FileName', '466e', 'str'), - elem('FileMimeType', '4660', 'str'), - elem('FileData', '465c', 'binary'), - elem('FileUID', '46ae', 'uint'), - }), - }), - - elem('Chapters', '1043a770', { - elem('EditionEntry*', '45b9', { - elem('EditionUID', '45bc', 'uint'), - elem('EditionFlagHidden', '45bd', 'uint'), - elem('EditionFlagDefault', '45db', 'uint'), - elem('EditionFlagOrdered', '45dd', 'uint'), - elem('ChapterAtom*', 'b6', { - elem('ChapterUID', '73c4', 'uint'), - elem('ChapterTimeStart', '91', 'uint'), - elem('ChapterTimeEnd', '92', 'uint'), - elem('ChapterFlagHidden', '98', 'uint'), - elem('ChapterFlagEnabled', '4598', 'uint'), - elem('ChapterSegmentUID', '6e67', 'binary'), - elem('ChapterSegmentEditionUID', '6ebc', 'uint'), - elem('ChapterDisplay*', '80', { - elem('ChapString', '85', 'str'), - elem('ChapLanguage*', '437c', 'str'), - elem('ChapCountry*', '437e', 'str'), - }), - }), - }), - }), - elem('Tags*', '1254c367', { - elem('Tag*', '7373', { - elem('Targets', '63c0', { - elem('TargetTypeValue', '68ca', 'uint'), - elem('TargetTrackUID', '63c5', 'uint'), - elem('TargetEditionUID', '63c9', 'uint'), - elem('TargetChapterUID', '63c4', 'uint'), - elem('TargetAttachmentUID', '63c6', 'uint'), - }), - elem('SimpleTag*', '67c8', { - elem('TagName', '45a3', 'str'), - elem('TagLanguage', '447a', 'str'), - elem('TagString', '4487', 'str'), - }), - }), - }), - }), - ); -} - -1; diff --git a/TOOLS/lib/Parse/Matroska/Element.pm b/TOOLS/lib/Parse/Matroska/Element.pm deleted file mode 100644 index fa0830c11e..0000000000 --- a/TOOLS/lib/Parse/Matroska/Element.pm +++ /dev/null @@ -1,331 +0,0 @@ -use 5.008; -use strict; -use warnings; - -# ABSTRACT: a mid-level representation of an EBML element -package Parse::Matroska::Element; - -use Carp; -use List::Util qw{first}; - -=head1 SYNOPSIS - - use Parse::Matroska::Reader; - my $reader = Parse::Matroska::Reader->new($path); - my $elem = $reader->read_element; - - print "ID: $elem->{elid}\n"; - print "Name: $elem->{name}\n"; - print "Length: $elem->{content_len}\n"; - print "Type: $elem->{type}\n"; - print "Child count: ", scalar(@{$elem->all_children}), "\n"; - if ($elem->{type} eq 'sub') { - while (my $chld = $elem->next_child) { - print "Child Name: $chld->{name}\n"; - } - } else { - print "Value: ", $elem->get_value, "\n"; - } - -=head1 DESCRIPTION - -Represents a single Matroska element as decoded by -L. This is essentially a hash -augmented with functions for delay-loading of binary -values and children elements. - -=head1 NOTE - -The API of this module is not yet considered stable. - -=attr elid - -The EBML Element ID, suitable for passing to -L. - -=attr name - -The EBML Element's name. - -=attr type - -The EBML Element's type. Can be C, C, -C, C, C or C. See L -for details. - -Equivalent to -C{value})-E{valtype}>. - -=attr value - -The EBML Element's value. Should be obtained through -L. - -Is an unicode string if the L is C, that is, -the string has already been decoded by L. - -Is C if the L is C and the contents -were delay-loaded and not yet read. L will -do the delayed load if needed. - -Is an arrayref if the L is C, containing -the children nodes that were already loaded. - -Is a hashref if the L is C, containing -the referred element's information as defined in -L. Calling -C{value}-E{elid})> will -return the same object as $elem->{value}. - -=attr full_len - -The entire length of this EBML Element, including -the header's. - -=attr size_len - -The length of the size marker. Used when calculating -L from L - -=attr content_len - -The length of the contents of this EBML Element, -which excludes the header. - -=attr reader - -A weakened reference to the associated -L. - -=method new(%hash) - -Creates a new Element initialized with the hash -given as argument. - -=cut -sub new { - my $class = shift; - my $self = {}; - bless $self, $class; - - $self->initialize(@_); - return $self; -} - -=method initialize(%hash) - -Called by L on initialization. - -=cut -sub initialize { - my ($self, %args) = @_; - for (keys %args) { - $self->{$_} = $args{$_}; - } - $self->{depth} = 0 unless $self->{depth}; -} - -=method skip - -Called by the user to ignore the contents of this EBML node. -Needed when ignoring the children of a node. - -=cut -sub skip { - my ($self) = @_; - my $reader = $self->{reader}; - return unless $reader; # we don't have to skip if there's no reader - my $pos = $reader->getpos; - croak "Too late to skip, reads were already done" - if $pos ne $self->{data_pos}; - $reader->skip($self->{content_len}); -} - -=method get_value($keep_bin) - -Returns the value contained by this EBML element. - -If the element has children, returns an arrayref to -the children elements that were already encountered. - -If the element's type is C and the value was -delay-loaded, does the reading now. - -If $keep_bin is true, the delay-loaded data is kept -as the L, otherwise, further calls to -C will reread the data from the L. - -=cut -sub get_value { - my ($self, $keep_bin) = @_; - - return undef if $self->{type} eq 'skip'; - return $self->{value} if $self->{value}; - - my $reader = $self->{reader} or - croak "The associated Reader has been deleted"; - - # delay-loaded 'binary' - if ($self->{type} eq 'binary') { - croak "Cannot seek in the current Reader" unless $self->{data_pos}; - # seek to the data position... - $reader->setpos($self->{data_pos}); - # read the data, keeping it in value if requested - if ($keep_bin) { - $self->{value} = $reader->readlen($self->{content_len}); - return $self->{value}; - } else { - return $reader->readlen($self->{content_len}); - } - } -} - -=method next_child($read_bin) - -Builtin iterator; reads and returns the next child element. -Always returns undef if the type isn't C. - -Returns undef at the end of the iterator and resets itself to -point to the first element; so calling L -after the iterator returned C will return the first child. - -The optional C<$read_bin> parameter has the children elements -not delay-load their value if their type is C. - -If all children elements have already been read, return -each element in-order as would be given by -L. - -=cut -sub next_child { - my ($self, $read_bin) = @_; - return unless $self->{type} eq 'sub'; - - if ($self->{_all_children_read}) { - my $idx = $self->{_last_child} ||= 0; - if ($idx == @{$self->{value}}) { - # reset the iterator, returning undef once - $self->{_last_child} = 0; - return; - } - my $ret = $self->{value}->[$idx]; - - ++$idx; - $self->{_last_child} = $idx; - return $ret; - } - - my $len = defined $self->{remaining_len} - ? $self->{remaining_len} - : $self->{content_len}; - - if ($len == 0) { - # we've read all children; switch into $self->{value} iteration mode - $self->{_all_children_read} = 1; - # return undef since the iterator will reset - return; - } - - $self->{pos_offset} ||= 0; - my $pos = $self->{data_pos}; - my $reader = $self->{reader} or croak "The associated reader has been deleted"; - $reader->setpos($pos); - $reader->{fh}->seek($self->{pos_offset}, 1) if $pos; - - my $chld = $reader->read_element($read_bin); - return undef unless defined $chld; - $self->{pos_offset} += $chld->{full_len}; - - $self->{remaining_len} = $len - $chld->{full_len}; - - if ($self->{remaining_len} < 0) { - croak "Child elements consumed $self->{remaining_len} more bytes than parent $self->{name} contained"; - } - - $chld->{depth} = $self->{depth} + 1; - $self->{value} ||= []; - - push @{$self->{value}}, $chld; - - return $chld; -} - -=method all_children($recurse,$read_bin) - -Calls L on self -and returns an arrayref with the children nodes. - -Both C<$recurse> and C<$read_bin> are optional and default -to false. - -=cut -sub all_children { - my ($self, $recurse, $read_bin) = @_; - $self->populate_children($recurse, $read_bin); - return $self->{value}; -} - -=method children_by_name($name) - -Searches in the already read children elements for all -elements with the EBML name C<$name>. Returns an array -containing all found elements. On scalar context, -returns only the first element found. - -Croaks if the element's C isn't C. - -=cut -sub children_by_name { - my ($self, $name) = @_; - return unless defined wantarray; # don't do work if work isn't wanted - croak "Element can't have children" unless $self->{type} eq 'sub'; - - my @found = grep { $_->{name} eq $name } @{$self->{value}}; - return @found if wantarray; # list - return shift @found if defined wantarray; # scalar -} - -=method populate_children($recurse,$read_bin) - -Populates the internal array of children elements, that is, -requests that the associated L reads -all children elements. Returns itself. - -Returns false if the element's C isn't C. - -If C<$recurse> is provided and is true, the method will call -itself in the children elements with the same parameters it -received; this will build a full EBML tree. - -If C<$read_bin> is provided and is true, disables delay-loading -of the contents of C-type nodes, reading the contents -to memory. - -If both C<$recurse> and C<$read_bin> are true, entire EBML trees -can be loaded without requiring seeks, thus behaving correctly -on unseekable streams. If C<$read_bin> is false, the entire EBML -tree is still loaded, but calling L on C-type -nodes will produce an error on unseekable streams. - -=cut -sub populate_children { - my ($self, $recurse, $read_bin) = @_; - - return unless $self->{type} eq 'sub'; - - if (@{$self->{value}} && $recurse) { - # only recurse - foreach (@{$self->{value}}) { - $_->populate_children($recurse, $read_bin); - } - return $self; - } - - while (my $chld = $self->next_child($read_bin)) { - $chld->populate_children($recurse, $read_bin) if $recurse; - } - - return $self; -} - -1; diff --git a/TOOLS/lib/Parse/Matroska/Reader.pm b/TOOLS/lib/Parse/Matroska/Reader.pm deleted file mode 100644 index 614b7b12c0..0000000000 --- a/TOOLS/lib/Parse/Matroska/Reader.pm +++ /dev/null @@ -1,426 +0,0 @@ -use 5.008; -use strict; -use warnings; - -# ABSTRACT: a low-level reader for EBML files -package Parse::Matroska::Reader; - -use Parse::Matroska::Definitions qw{elem_by_hexid}; -use Parse::Matroska::Element; - -use Carp; -use Scalar::Util qw{openhandle weaken}; -use IO::Handle; -use IO::File; -use List::Util qw{first}; -use Encode; - -use constant BIGINT_TRY => 'Pari,GMP,FastCalc'; -use Math::BigInt try => BIGINT_TRY; -use Math::BigRat try => BIGINT_TRY; - -=head1 SYNOPSIS - - use Parse::Matroska::Reader; - my $reader = Parse::Matroska::Reader->new($path); - $reader->close; - $reader->open(\$string_with_matroska_data); - - my $elem = $reader->read_element; - print "Element ID: $elem->{elid}\n"; - print "Element name: $elem->{name}\n"; - if ($elem->{type} ne 'sub') { - print "Element value: $elem->get_value\n"; - } else { - while (my $child = $elem->next_child) { - print "Child element: $child->{name}\n"; - } - } - $reader->close; - -=head1 DESCRIPTION - -Reads EBML data, which is used in Matroska files. -This is a low-level reader which is meant to be used as a backend -for higher level readers. TODO: write the high level readers :) - -=head1 NOTE - -The API of this module is not yet considered stable. - -=method new - -Creates a new reader. -Calls L with its arguments if provided. - -=cut -sub new { - my $class = shift; - my $self = {}; - bless $self, $class; - - $self->open(@_) if @_; - return $self; -} - -=method open($arg) - -Creates the internal filehandle. The argument can be: - -=for :list -* An open filehandle or L object. -The filehandle is not Ced, so calling L in this -object will close the given filehandle as well. -* A scalar containing a path to a file. -* On perl v5.14 or newer, a scalarref pointing to EBML data. -For similar functionality in older perls, give an L object -or the handle to an already Ced scalarref. - -=cut -sub open { - my ($self, $arg) = @_; - $self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw") - or croak "Can't open $arg: $!"; -} - -=method close - -Closes the internal filehandle. - -=cut -sub close { - my ($self) = @_; - $self->{fh}->close; - delete $self->{fh}; -} - -# equivalent to $self->readlen(1), possibly faster -sub _getc { - my ($self) = @_; - my $c = $self->{fh}->getc; - croak "Can't do read of length 1: $!" if !defined $c && $!; - return $c; -} - -=method readlen($length) - -Reads C<$length> bytes from the internal filehandle. - -=cut -sub readlen { - my ($self, $len) = @_; - my $data; - my $readlen = $self->{fh}->read($data, $len); - croak "Can't do read of length $len: $!" - unless defined $readlen; - return $data; -} - -# converts a byte string into an integer -# we do so by converting the integer into a hex string (big-endian) -# and then reading the hex-string into an integer -sub _bin2int($) { - my ($bin) = @_; - # if the length is larger than 3 - # the resulting integer might be larger than INT_MAX - if (length($bin) > 3) { - return Math::BigInt->from_hex(unpack("H*", $bin)); - } - return hex(unpack("H*", $bin)); -} - -# creates a floating-point number with the given mantissa and exponent -sub _ldexp { - my ($mantissa, $exponent) = @_; - my $r = new Math::BigRat($mantissa); - return $r * Math::BigRat->new(2)**$exponent; -} - -# NOTE: the read_* functions are hard to read because they're ports -# of even harder to read python functions. -# TODO: make them readable - -=method read_id - -Reads an EBML ID atom in hexadecimal string format, suitable -for passing to L. - -=cut -sub read_id { - my ($self) = @_; - my $t = $self->_getc; - return undef unless defined $t; - my $i = 0; - my $mask = 1<<7; - - if (ord($t) == 0) { - croak "Matroska Syntax error: first byte of ID was \\0" - } - until (ord($t) & $mask) { - ++$i; - $mask >>= 1; - } - # return hex string of the bytes we just read - return unpack "H*", ($t . $self->readlen($i)); -} - -=method read_size - -Reads an EBML Data Size atom, which immediately follows -an EBML ID atom. - -This returns an array consisting of: - -=for :list -0. The length of the Data Size atom. -1. The value encoded in the Data Size atom, which is the length of all the data following it. - -=cut -sub read_size { - my ($self) = @_; - my $t = $self->_getc; - my $i = 0; - my $mask = 1<<7; - - if (ord($t) == 0) { - croak "Matroska Syntax error: first byte of data size was \\0" - } - until (ord($t) & $mask) { - ++$i; - $mask >>= 1; - } - $t = $t & chr($mask-1); # strip length bits (keep only significant bits) - return ($i+1, _bin2int $t . $self->readlen($i)); -} - -=method read_str($length) - -Reads a string of length C<$length> bytes from the internal filehandle. -The string is already Ld from C, which is the -standard Matroska string encoding. - -=cut -{ - my $utf8 = find_encoding("UTF-8"); - sub read_str { - my ($self, $length) = @_; - return $utf8->decode($self->readlen($length)); - } -} - -=method read_uint($length) - -Reads an unsigned integer of length C<$length> bytes -from the internal filehandle. - -Returns a L object if C<$length> is greater -than 4. - -=cut -sub read_uint { - my ($self, $length) = @_; - return _bin2int $self->readlen($length); -} - -=method read_sint($length) - -Reads a signed integer of length C<$length> bytes -from the internal filehandle. - -Returns a L object if C<$length> is greater -than 4. - -=cut -sub read_sint { - my ($self, $length) = @_; - my $i = $self->read_uint($length); - - # Apply 2's complement to the unsigned int - my $mask = int(2 ** ($length * 8 - 1)); - # if the most significant bit is set... - if ($i & $mask) { - # subtract the MSB twice - $i -= 2 * $mask; - } - return $i; -} - -=method read_float($length) - -Reads an IEEE floating point number of length C<$length> -bytes from the internal filehandle. - -Only lengths C<4> and C<8> are supported (C C and C). - -=cut -{ - my $b1 = new Math::BigInt 1; - - sub read_float { - my ($self, $length) = @_; - my $i = new Math::BigInt $self->read_uint($length)->bstr; - my $f; - - # These evil expressions reinterpret an unsigned int as IEEE binary floats - if ($length == 4) { - $f = _ldexp(($i & ((1<<23) - 1)) + (1<<23), ($i>>23 & ((1<<8) - 1)) - 150); - $f = -$f if $i & ($b1<<31); - } elsif ($length == 8) { - $f = _ldexp(($i & (($b1<<52) - 1)) + ($b1<<52), ($i>>52 & ((1<<12) - 1)) - 1075); - $f = -$f if $i & ($b1<<63); - } else { - croak "Matroska Syntax error: unsupported IEEE float byte size $length"; - } - - return $f; - } -} - -=method read_ebml_id($length) - -Reads an EBML ID when it's encoded as the data inside another -EBML element, that is, when the enclosing element's C is -C. - -This returns a hashref with the EBML element description as -defined in L. - -=cut -sub read_ebml_id { - my ($self, $length) = @_; - return elem_by_hexid(unpack("H*", $self->readlen($length))); -} - -=method skip($length) - -Skips C<$length> bytes in the internal filehandle. - -=cut -sub skip { - my ($self, $len) = @_; - return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1); - $self->readlen($len); - return; -} - -=method getpos - -Wrapper for Lgetpos> in the internal filehandle. - -Returns undef if the internal filehandle can't C. - -=cut -sub getpos { - my ($self) = @_; - return undef unless $self->{fh}->can('getpos'); - return $self->{fh}->getpos; -} - -=method setpos($pos) - -Wrapper for Lsetpos> in the internal filehandle. - -Returns C if the internal filehandle can't C. - -Croaks if C does not seek to the requested position, -that is, if calling C does not yield the same object -as the C<$pos> argument. - -=cut -sub setpos { - my ($self, $pos) = @_; - return undef unless $pos && $self->{fh}->can('setpos'); - - my $ret = $self->{fh}->setpos($pos); - croak "Cannot seek to correct position" - unless $self->getpos eq $pos; - return $ret; -} - -=method read_element($read_bin) - -Reads a full EBML element from the internal filehandle. - -Returns a L object initialized with -the read data. If C is not present or is false, will -delay-load the contents of C type elements, that is, -they will only be loaded when calling C on the -returned L object. - -Does not read the children of the element if its type is -C. Look into the L interface -for details in how to read children elements. - -Pass a true C<$read_bin> if the stream being read is not -seekable (C is undef) and the contents of C -elements is desired, otherwise seeking errors or internal -filehandle corruption might occur. - -=cut -sub read_element { - my ($self, $read_bin) = @_; - return undef if $self->{fh}->eof; - - my $elem_pos = $self->getpos; - - my $elid = $self->read_id; - my $elem_def = elem_by_hexid($elid); - my ($size_len, $content_len) = $self->read_size; - my $full_len = length($elid)/2 + $size_len + $content_len; - - my $elem = Parse::Matroska::Element->new( - elid => $elid, - name => $elem_def && $elem_def->{name}, - type => $elem_def && $elem_def->{valtype}, - size_len => $size_len, - content_len => $content_len, - full_len => $full_len, - reader => $self, - elem_pos => $elem_pos, - data_pos => $self->getpos, - ); - weaken($elem->{reader}); - - if (defined $elem_def) { - if ($elem->{type} eq 'sub') { - $elem->{value} = []; - } elsif ($elem->{type} eq 'str') { - $elem->{value} = $self->read_str($content_len); - } elsif ($elem->{type} eq 'ebml_id') { - $elem->{value} = $self->read_ebml_id($content_len); - } elsif ($elem->{type} eq 'uint') { - $elem->{value} = $self->read_uint($content_len); - } elsif ($elem->{type} eq 'sint') { - $elem->{value} = $self->read_sint($content_len); - } elsif ($elem->{type} eq 'float') { - $elem->{value} = $self->read_float($content_len); - } elsif ($elem->{type} eq 'skip') { - $self->skip($content_len); - } elsif ($elem->{type} eq 'binary') { - if ($read_bin) { - $elem->{value} = $self->readlen($content_len); - } else { - $self->skip($content_len); - } - } else { - die "Matroska Definition error: type $elem->{valtype} unknown" - } - } else { - $self->skip($content_len); - } - return $elem; -} - -1; - -=head1 CAVEATS - -Children elements have to be processed as soon as an element -with children is found, or their children ignored with -L. Not doing so doesn't cause -errors but results in an invalid structure, with constant C<0> -depth. - -To work correctly in unseekable streams, either the contents -of C-type elements has to be ignored or the C -flag to C has to be true. diff --git a/TOOLS/lib/Parse/Matroska/Utils.pm b/TOOLS/lib/Parse/Matroska/Utils.pm deleted file mode 100644 index 127d626cb1..0000000000 --- a/TOOLS/lib/Parse/Matroska/Utils.pm +++ /dev/null @@ -1,37 +0,0 @@ -use strict; -use warnings; - -# ABSTRACT: internally-used helper functions -package Parse::Matroska::Utils; - -use Exporter; -our @ISA = qw{Exporter}; -our @EXPORT_OK = qw{uniq uncamelize}; - -=method uniq(@array) - -The same as L. -Included to avoid depending on it since it's -not a core module. - -=cut -sub uniq(@) { - my %seen; - return grep { !$seen{$_}++ } @_; -} - -=method uncamelize($string) - -Converts a "StringLikeTHIS" into a -"string_like_this". - -=cut -sub uncamelize($) { - local $_ = shift; - # lc followed by UC: lc_UC - s/(?<=[a-z])([A-Z])/_\L$1/g; - # UC followed by two lc: _UClclc - s/([A-Z])(?=[a-z]{2})/_\L$1/g; - # strip leading _ that the second regexp might add; lowercase all - s/^_//; lc -} -- cgit v1.2.3