# HTML5 parsing to LibXML document.
#
# Written in 2017 by Vincent Lefevre <vincent@vinc17.net>.
# Public domain.
#
# Changes:
#   2023-09-21: detect libxml2 "bad name" errors.
#   2019-12-16: better error messages; debug support.
#   2018-10-24: avoid a fatal error by ignoring the nodes (comments...)
#               outside the root element.
#   2017-08-05: first version.

package HTML5ToLibXML;

use strict;
use Carp;
use HTML::Gumbo;
use XML::LibXML;

# Debug switch, initialised from the HTML5TOLIBXML_DEBUG environment
# variable. When true, parse() traces each event, tag and attribute
# with warn.
our $debug = $ENV{HTML5TOLIBXML_DEBUG};
# Quiet switch, initialised from the HTML5TOLIBXML_QUIET environment
# variable. When true, parse() does not warn about bad attribute names.
our $quiet = $ENV{HTML5TOLIBXML_QUIET};

# Void elements: https://www.w3.org/TR/html5/syntax.html#void-elements
# Set of tag names (name => 1). parse() does not push these elements
# on its stack of open elements, so that they never get children.
my %voidelem = map { $_ => 1 } qw(area base br col embed hr img input keygen
                                  link meta param source track wbr);

# Parse an HTML5 document and build the corresponding LibXML document.
#
# Argument:
#   $html - the HTML5 source, as a string. The ($) prototype puts the
#           argument in scalar context when parse is called as a plain
#           function.
#
# Returns an XML::LibXML::Document (version 1.0, encoding utf-8). The
# first element reported by HTML::Gumbo becomes the document element and
# is created in the XHTML namespace; the other elements are created with
# createElement, i.e. without an explicit namespace. Text, comment and
# CDATA nodes are copied only while an element is open, so that nodes
# outside the root element are ignored.
#
# Errors:
#   - confess on an odd-sized attribute list, on an 'end' event that
#     does not match the innermost open element, and on an unknown event
#     received while an element is open;
#   - an attribute rejected by libxml2 with a "bad name" error is skipped
#     with a warning (unless $quiet is true); any other error from
#     setAttribute is propagated.
sub parse ($)
  {
    my ($html) = @_;
    my $doc = XML::LibXML::Document->createDocument('1.0', 'utf-8');
    # Stack of the currently open elements; $nodes[-1] is the innermost
    # one, to which new children are appended.
    my @nodes;

    HTML::Gumbo->new->parse
      ($html, format => 'callback', callback => sub
       {
         # The first argument is the event name; the remaining ones
         # (left in @_) depend on the event.
         my ($event) = shift;
         warn "Event: $event\n" if $debug;
         if ($event =~ /^document (start|end)$/ )
           { }  # Nothing to do: the document has already been created.
         elsif ($event eq 'start' )
           {
             # $attr is a reference to a flat (name, value, ...) list.
             my ($tag, $attr) = @_;
             warn "  Tag: <$tag>\n" if $debug;
             my $element;
             if (@nodes)
               {
                 # Inside the tree: attach to the innermost open element.
                 $element = $doc->createElement($tag);
                 $nodes[-1]->appendChild($element);
               }
             else
               {
                 # No open element: this is the root element.
                 $element =
                   $doc->createElementNS('http://www.w3.org/1999/xhtml', $tag);
                 $doc->setDocumentElement($element);
               }
             while (@$attr)
               {
                 @$attr >= 2 or confess "missing attribute value";
                 # Saved for the error message, as the pair is removed
                 # from @$attr by the splice below.
                 my $name = $attr->[0];
                 warn "  Att: $name=\"$attr->[1]\"\n" if $debug;
                 # Gumbo does not signal a parse error for
                 # unexpected-character-in-attribute-name.
                 # Thus we need to detect that by checking
                 # libxml2 "bad name" errors.
                 # Note: the splice is evaluated before the call, so the
                 # pair is consumed even when setAttribute fails.
                 eval { $element->setAttribute(splice @$attr, 0, 2); };
                 if ($@)
                   {
                     # Propagate anything that is not a "bad name" error.
                     $@ =~ /^bad name / or die;
                     warn "$0: parse error (bad attribute name $name)"
                       unless $quiet;
                   }
               }
             # Void elements are not kept open, as they cannot have
             # children (HTML::Gumbo is expected not to send an 'end'
             # event for them).
             push @nodes, $element unless $voidelem{$tag};
           }
         elsif ($event eq 'end')
           {
             # The closed tag must be the innermost open element. The
             # stack is checked first, so that a stray 'end' event is
             # reported as an internal error too, instead of failing
             # on a method call on an undefined value.
             @nodes && $_[0] eq $nodes[-1]->nodeName
               or confess "internal error";
             pop @nodes;
           }
         elsif (@nodes)
           {
             # Character data, copied only while an element is open
             # (nodes outside the root element are ignored).
             my $node;
             if ($event =~ /^(text|space)$/)
               { $node = $doc->createTextNode($_[0]); }
             elsif ($event eq 'comment')
               { $node = $doc->createComment($_[0]); }
             elsif ($event eq 'cdata')
               { $node = $doc->createCDATASection($_[0]); }
             else
               { confess "unknown event"; }
             $nodes[-1]->appendChild($node);
           }
       }
      );

    return $doc;
  }

1;

=head1 NAME

HTML5ToLibXML - HTML5 parsing to LibXML document

=head1 SYNOPSIS

  use HTML5ToLibXML;
  print HTML5ToLibXML::parse('<div>foo');

=head1 DESCRIPTION

HTML5ToLibXML parses an HTML5 document provided as a string and produces a
LibXML document. It uses the L<HTML::Gumbo> and L<XML::LibXML> modules.
Nodes (comments...) outside the root element are not copied.

=head1 EXAMPLE

The following script takes an HTML5 file as an argument or on the standard
input and outputs the corresponding XHTML file and the text content.

  use HTML5ToLibXML;

  use open ':encoding(UTF-8)';
  binmode STDIN, ':encoding(UTF-8)';

  my $doc = HTML5ToLibXML::parse(do { local $/; <> });
  $doc->toFH(*STDOUT, 0);

  binmode STDOUT, ':encoding(UTF-8)';
  print "Text content: ", $doc->textContent;

=head1 AUTHOR

Vincent Lefevre <vincent@vinc17.net>

=head1 LICENSE

Public domain.

=cut

# $Id: HTML5ToLibXML.pm 161712 2023-09-21 11:11:34Z vinc17/cventin $