package Net::Yadis::HTMLParse;
use strict;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(parseMetaTags);
my $htmlre = qr{
# Starts with the tag name at a word boundary, where the tag name is
# not a namespace
<html\b(?!:)
# All of the stuff up to a ">", hopefully attributes.
([^>]*?)
(?: # Match a short tag
/>
| # Match a full tag
>
# contents
(.*?)
# Closed by
(?: # One of the specified close tags
</?html\s*>
# End of the string
| \Z
)
)
}isox;
my $headre = qr{
# Starts with the tag name at a word boundary, where the tag name is
# not a namespace
<head\b(?!:)
# All of the stuff up to a ">", hopefully attributes.
([^>]*?)
(?: # Match a short tag
/>
| # Match a full tag
>
# match the contents of the full tag
(.*?)
# Closed by
(?: # One of the specified close tags
</?(?:head|body)\s*>
# End of the string
| \Z
)
)
}soxi;
# http-equiv = $2 || $3
# content = $5 || $6
my $tagre = qr{
<meta\s+http-equiv=
(?:
# between matching quote marks
(["'])(.*?)\1
|
# or up to whitespace
([^"'\s]+)
)
\s*
content=
(?:
# between matching quote marks
(["'])(.*?)\4
|
# or up to whitespace
([^"'\s]+)
)
\s*
/?>?
}sixo;
my $removere = qr{
# Comments
<!--.*?-->
# CDATA blocks
| <!\[CDATA\[.*?\]\]>
# script blocks
| <script\b
# make sure script is not an XML namespace
(?!:)
[^>]*>.*?</script>
}soix;
my %replacements = (
'amp' => '&',
'lt' => '<',
'gt' => '>',
'quot' => '"',
);
sub parseMetaTags {
my ($html) = @_;
$html =~ s/$removere//;
$html =~ $htmlre or return ();
my $htmlcontents = $2;
$htmlcontents =~ $headre or return ();
my $head = $2;
defined $head or return ();
my %headerhash;
foreach my $tag ($head =~ /$tagre/g) {
my ($httpequiv,$content) = ($2 || $3, $5 || $6);
for my $pat (keys %replacements) {
$httpequiv =~ s/&$pat;/$replacements{$pat}/g;
$content =~ s/&$pat;/$replacements{$pat}/g;
}
$headerhash{lc($httpequiv)}=$content;
}
return \%headerhash;
}
syntax highlighted by Code2HTML, v. 0.9.1