# HTML5 parsing to LibXML document.
#
# Written in 2017 by Vincent Lefevre.
# Public domain.
#
# Changes:
#   2023-09-21: detect libxml2 "bad name" errors.
#   2019-12-16: better error messages; debug support.
#   2018-10-24: avoid a fatal error by ignoring the nodes (comments...)
#               outside the root element.
#   2017-08-05: first version.

package HTML5ToLibXML;

use strict;
use warnings;
use Carp;
use HTML::Gumbo;
use XML::LibXML;

# Diagnostic switches, initialised from the environment when the module
# is loaded:
#   $debug - when true, every parser event, tag and attribute is traced
#            with warn;
#   $quiet - when true, the "bad attribute name" parse-error warning is
#            suppressed.
our $debug = $ENV{HTML5TOLIBXML_DEBUG};
our $quiet = $ENV{HTML5TOLIBXML_QUIET};

# Void elements: https://www.w3.org/TR/html5/syntax.html#void-elements
my %voidelem = map { $_ => 1 }
    qw(area base br col embed hr img input keygen link meta
       param source track wbr);

# parse($html)
#
# Parse the HTML5 document given as the string $html with HTML::Gumbo
# (callback format) and build the corresponding XML::LibXML document.
#
# Returns the XML::LibXML::Document that was built.  The first element
# seen becomes the document element and is created in the XHTML
# namespace; every other element is created with createElement and
# appended to the innermost open element.  Text, comment and CDATA
# events received while no element is open (i.e. outside the root
# element) are ignored.
#
# Dies (via confess) on an attribute list with an odd number of items,
# on an unbalanced 'end' event, and on an unknown event inside the root
# element.  Any error raised while setting an attribute other than a
# libxml2 "bad name" error is propagated; a "bad name" error only
# produces a warning (unless $quiet is set) and the attribute is dropped.
#
# The ($) prototype is kept on purpose: it is part of the existing
# interface and makes callers evaluate the argument in scalar context.
sub parse ($) {
    my ($html) = @_;

    my $doc = XML::LibXML::Document->createDocument('1.0', 'utf-8');

    # Stack of the currently open elements; $nodes[-1] is the innermost.
    my @nodes;

    HTML::Gumbo->new->parse(
        $html,
        format   => 'callback',
        callback => sub {
            my ($event) = shift;
            warn "Event: $event\n" if $debug;

            if ($event =~ /^document (start|end)$/) {
                # Nothing to do for the document boundaries.
            }
            elsif ($event eq 'start') {
                my ($tag, $attr) = @_;
                warn " Tag: <$tag>\n" if $debug;

                my $element;
                if (@nodes) {
                    $element = $doc->createElement($tag);
                    $nodes[-1]->appendChild($element);
                }
                else {
                    # No open element yet: this one is the root.
                    $element = $doc->createElementNS(
                        'http://www.w3.org/1999/xhtml', $tag);
                    $doc->setDocumentElement($element);
                }

                # $attr is a flat (name, value, name, value, ...) list,
                # consumed two items at a time.
                while (@$attr) {
                    @$attr >= 2 or confess "missing attribute value";
                    my $name = $attr->[0];
                    warn " Att: $name=\"$attr->[1]\"\n" if $debug;

                    # Gumbo does not signal a parse error for
                    # unexpected-character-in-attribute-name.
                    # Thus we need to detect that by checking
                    # libxml2 "bad name" errors.
                    # The splice is evaluated before setAttribute is
                    # called, so the pair is removed from the list even
                    # when setAttribute dies and the loop always advances.
                    eval { $element->setAttribute(splice @$attr, 0, 2); };
                    if ($@) {
                        # A bare die re-raises the error held in $@.
                        $@ =~ /^bad name / or die;
                        warn "$0: parse error (bad attribute name $name)"
                            unless $quiet;
                    }
                }

                # Void elements are not pushed; presumably no 'end' event
                # is received for them (otherwise the consistency check
                # in the 'end' branch would fail) - to be confirmed
                # against the HTML::Gumbo documentation.
                push @nodes, $element unless $voidelem{$tag};
            }
            elsif ($event eq 'end') {
                # The closing tag must match the innermost open element.
                # The stack is tested first so that an unbalanced 'end'
                # is reported as an internal error instead of failing on
                # a method call on an undefined value.
                @nodes && $_[0] eq $nodes[-1]->nodeName
                    or confess "internal error";
                pop @nodes;
            }
            elsif (@nodes) {
                # Character data and comments; only reached while an
                # element is open, so that nodes outside the root element
                # are skipped.
                my $node;
                if ($event =~ /^(text|space)$/) {
                    $node = $doc->createTextNode($_[0]);
                }
                elsif ($event eq 'comment') {
                    $node = $doc->createComment($_[0]);
                }
                elsif ($event eq 'cdata') {
                    $node = $doc->createCDATASection($_[0]);
                }
                else {
                    confess "unknown event";
                }
                $nodes[-1]->appendChild($node);
            }
        }
    );

    return $doc;
}

1;

=head1 NAME

HTML5ToLibXML - HTML5 parsing to LibXML document

=head1 SYNOPSIS

  use HTML5ToLibXML;
  print HTML5ToLibXML::parse('<!DOCTYPE html>
  <title>foo</title>');

=head1 DESCRIPTION

HTML5ToLibXML parses an HTML5 document provided as a string and produces
a LibXML document. It uses the L<HTML::Gumbo> and L<XML::LibXML> modules.

Nodes (comments...) outside the root element are not copied.

If the C<HTML5TOLIBXML_DEBUG> environment variable is set to a true value
when the module is loaded, the parser events, tags and attributes are
traced on the standard error stream. If C<HTML5TOLIBXML_QUIET> is set to
a true value, the warning about a bad attribute name is not output.

=head1 EXAMPLE

The following script takes an HTML5 file in argument or on the standard
input and outputs the corresponding XHTML file and the text content.

  use HTML5ToLibXML;
  use open ':encoding(UTF-8)';
  binmode STDIN, ':encoding(UTF-8)';
  my $doc = HTML5ToLibXML::parse(do { local $/; <> });
  $doc->toFH(*STDOUT, 0);
  binmode STDOUT, ':encoding(UTF-8)';
  print "Text content: ", $doc->textContent;

=head1 AUTHOR

Vincent Lefevre

=head1 LICENSE

Public domain.

=cut

# $Id: HTML5ToLibXML.pm 161712 2023-09-21 11:11:34Z vinc17/cventin $