# $Id: HTMLWriter.pm,v 1.7 2003/03/30 09:47:44 matt Exp $
package XML::Handler::HTMLWriter;
use strict;
use vars qw($VERSION @ISA);
$VERSION = '2.01';
use XML::SAX::Writer ();
use HTML::Entities ();
@ISA = ('XML::SAX::Writer');
sub new {
my $class = shift;
my $opt = (@_ == 1) ? { %{shift()} } : {@_};
$opt->{Writer} = 'XML::SAX::Writer::HTML';
return XML::SAX::Writer->new($opt);
my $opt = XML::SAX::Writer->new(@_);
@ISA = (ref($opt));
return bless $opt, $class;
}
package XML::SAX::Writer::HTML;
use strict;
use XML::SAX::Writer::XML;
use vars qw(@ISA);
# NB: this only works because of how hacky XML::SAX::Writer is ;-)
@ISA = ('XML::SAX::Writer::XML');
sub print {
my $self = shift;
$self->{Consumer}->output($self->{Encoder}->convert(join('', @_)));
}
sub escape_attrib {
my $self = shift;
my $text = shift;
$text =~ s/&(?!\{)/&/g;
$text =~ s/"/"/g;
return $text;
}
sub escape_url {
my $self = shift;
my $toencode = shift;
$toencode =~ s/&(?!\{)/&/g;
$toencode =~ s/([^a-zA-Z0-9_.&;-])/uc sprintf("%%%02x",ord($1))/eg;
return $toencode;
}
sub escape_html {
my $self = shift;
return HTML::Entities::encode(join('', @_));
}
my @html_tags = qw(
a abbr acronym address
applet area b base
basefont bdo big blockquote
body br button caption
center cite code col
colgroup dd del dfn
dir div dl dt
em fieldset font form
frame frameset h1 h2
h3 h4 h5 h6
head hr html i
iframe img input ins
isindex kbd label legend
li link map menu
meta noframes noscript object
ol optgroup option p
param pre q s
samp script select small
span strike strong style
sub sup table tbody
td textarea tfoot th
thead title tr tt
u ul var
);
sub is_html_tag {
my $self = shift;
my $tag = lc(shift);
return grep /^$tag$/, @html_tags;
}
my @empty_tags = qw(
area base basefont
br col frame hr img
input isindex link
meta param
);
sub is_empty_tag {
my $self = shift;
my $tag = lc(shift);
return grep /^$tag$/, @empty_tags;
}
my @uri_attribs = qw(
form/action
body/background
blockquote/cite
q/cite
del/cite
ins/cite
object/classid
object/codebase
applet/codebase
object/data
a/href
area/href
link/href
base/href
img/longdesc
frame/longdesc
iframe/longdesc
head/profile
script/src
input/src
frame/src
iframe/src
img/src
img/usemap
input/usemap
object/usemap
);
sub is_url_attrib {
my $self = shift;
my $test = lc(shift);
return grep /^$test$/, @uri_attribs;
}
my @bool_attribs = qw(
input/checked
dir/compact
dl/compact
menu/compact
ol/compact
ul/compact
object/declare
script/defer
button/disabled
input/disabled
optgroup/disabled
option/disabled
select/disabled
textarea/disabled
img/ismap
input/ismap
select/multiple
area/nohref
frame/noresize
hr/noshade
td/nowrap
th/nowrap
textarea/readonly
input/readonly
option/selected
);
sub is_boolean_attrib {
my $self = shift;
my $test = lc(shift);
return grep /^$test$/, @bool_attribs;
}
sub start_document {
my ($self, $doc) = @_;
undef $self->{FirstElement};
$self->SUPER::start_document($doc);
}
sub start_element {
my ($self, $element) = @_;
$element->{Parent} = $self->{Current_Element};
$self->{Current_Element} = $element;
if (!$self->{FirstElement}) {
$self->{FirstElement}++;
if (lc($element->{Name}) ne 'html' || $element->{NamespaceURI}) {
die "First element has to be ";
}
if ($self->{DoctypePublic}) {
$self->print(qq({DoctypeSystem}) {
$self->print(qq( "$self->{DoctypeSystem}"));
}
$self->print(">\n");
}
elsif ($self->{DoctypeSystem}) {
$self->print(qq(\n));
}
else {
$self->print(
qq(\n)
);
}
}
if (!$element->{NamespaceURI} && $self->is_html_tag($element->{Name})) {
# HTML special cases...
$self->print("<$element->{Name}");
foreach my $attr (values %{$element->{Attributes}}) {
my $test = "$element->{LocalName}/$attr->{Name}";
if ($self->is_boolean_attrib($test)) {
$self->print(" $attr->{Name}");
}
elsif ($self->is_url_attrib($test)) {
$self->print(" $attr->{Name}=\"", $self->escape_url($attr->{Value}), "\"");
}
else {
$self->print(" $attr->{Name}=\"",
$self->escape_attrib($attr->{Value}),
"\"");
}
}
$self->print(">");
if (lc($element->{LocalName}) eq 'script') {
$self->print("\n") unless $self->{NoScriptComment};
}
$self->print("$element->{Name}>");
}
else {
$self->SUPER::end_element($element);
}
}
sub characters {
my ($self, $chars) = @_;
my $element = $self->{Current_Element};
if (!$element->{NamespaceURI} && $self->is_html_tag($element->{LocalName})) {
if (lc($element->{LocalName}) =~ /^(script|style)$/) {
$self->print($chars->{Data});
}
else {
$self->print($self->escape_html($chars->{Data}));
}
}
else {
$self->SUPER::characters($chars);
}
}
sub processing_instruction {
my ($self, $pi) = @_;
if (length $pi->{Data}) {
$self->print("", $pi->{Target}, " ", $pi->{Data}, ">");
}
else {
$self->print("", $pi->{Target}, ">");
}
}
sub comment {
my ($self, $comment) = @_;
# strip comments?
}
1;
__END__
=head1 NAME
XML::Handler::HTMLWriter - SAX Handler for writing HTML 4.0
=head1 SYNOPSIS
use XML::Handler::HTMLWriter;
use XML::SAX;
my $writer = XML::Handler::HTMLWriter->new(...);
my $parser = XML::SAX::ParserFactory->parser(Handler => $writer);
...
=head1 DESCRIPTION
This module is based on the rules for outputting HTML according to
http://www.w3.org/TR/xslt - the XSLT specification. It is a subclass of
XML::SAX::Writer, and the usage is the same as that module.
=head1 Usage
=head2 First create a new HTMLWriter object:
my $writer = XML::Handler::HTMLWriter->new(...);
The ... indicates parameters to be passed in. These are all passed
in using the hash syntax: Key => Value.
All parameters are from XML::SAX::Writer, so please see its documentation
for more details.
=head2 Now pass $writer to a SAX chain:
e.g. a SAX parser:
my $parser = XML::SAX::ParserFactory->parser(Handler => $writer);
Or a SAX filter:
my $tolower = XML::Filter::ToLower->new(Handler => $writer);
Or use in a SAX Machine:
use XML::SAX::Machines qw(Pipeline);
Pipeline(
XML::Filter::XSLT->new(Source => { SystemId => 'foo.xsl' })
=>
XML::Handler::HTMLWriter->new
)->parse_uri('foo.xml');
=head2 Initiate processing
XML::Handler::HTMLWriter never initiates processing itself, since it is
just a recepticle for SAX events. So you have to start processing on one
of the modules higher up the chain. For example in the XML::SAX parser
case:
$parser->parse(Source => { SystemId => "foo.xhtml" });
=head2 Get the results
Results work via the consumer interface as defined in XML::SAX::Writer.
=head1 HTML Output Methodology
Here is the relevant excerpt from TR/xslt [note that a bit of an
understanding of XSLT is necessary to read this, but don't worry -
understanding isn't necessary to use this module :-)]:
The html output method should not output an element differently from
the xml output method unless the expanded-name of the element has a
null namespace URI; an element whose expanded-name has a non-null
namespace URI should be output as XML. If the expanded-name of the
element has a null namespace URI, but the local part of the
expanded-name is not recognized as the name of an HTML element, the
element should output in the same way as a non-empty, inline element
such as span.
The html output method should not output an end-tag for empty
elements. For HTML 4.0, the empty elements are area, base, basefont,
br, col, frame, hr, img, input, isindex, link, meta and param. For
example, an element written as or in the stylesheet
should be output as .
The html output method should recognize the names of HTML elements
regardless of case. For example, elements named br, BR or Br should all
be recognized as the HTML br element and output without an end-tag.
The html output method should not perform escaping for the content of
the script and style elements. For example, a literal result element
written in the stylesheet as
or
should be output as
The html output method should not escape < characters occurring in
attribute values.
If the indent attribute has the value yes, then the html output method
may add or remove whitespace as it outputs the result tree, so long as
it does not change how an HTML user agent would render the output. The
default value is yes.
The html output method should escape non-ASCII characters in URI
attribute values using the method recommended in Section B.2.1 of the
HTML 4.0 Recommendation.
The html output method may output a character using a character entity
reference, if one is defined for it in the version of HTML that the
output method is using.
The html output method should terminate processing instructions with >
rather than ?>.
The html output method should output boolean attributes (that is
attributes with only a single allowed value that is equal to the name
of the attribute) in minimized form. For example, a start-tag written
in the stylesheet as