package Net::Yadis::HTMLParse; use strict; use Exporter; our @ISA = qw(Exporter); our @EXPORT_OK = qw(parseMetaTags); my $htmlre = qr{ # Starts with the tag name at a word boundary, where the tag name is # not a namespace ", hopefully attributes. ([^>]*?) (?: # Match a short tag /> | # Match a full tag > # contents (.*?) # Closed by (?: # One of the specified close tags # End of the string | \Z ) ) }isox; my $headre = qr{ # Starts with the tag name at a word boundary, where the tag name is # not a namespace ", hopefully attributes. ([^>]*?) (?: # Match a short tag /> | # Match a full tag > # match the contents of the full tag (.*?) # Closed by (?: # One of the specified close tags # End of the string | \Z ) ) }soxi; # http-equiv = $2 || $3 # content = $5 || $6 my $tagre = qr{ ? }sixo; my $removere = qr{ # Comments # CDATA blocks | # script blocks | ]*>.*? }soix; my %replacements = ( 'amp' => '&', 'lt' => '<', 'gt' => '>', 'quot' => '"', ); sub parseMetaTags { my ($html) = @_; $html =~ s/$removere//; $html =~ $htmlre or return (); my $htmlcontents = $2; $htmlcontents =~ $headre or return (); my $head = $2; defined $head or return (); my %headerhash; foreach my $tag ($head =~ /$tagre/g) { my ($httpequiv,$content) = ($2 || $3, $5 || $6); for my $pat (keys %replacements) { $httpequiv =~ s/&$pat;/$replacements{$pat}/g; $content =~ s/&$pat;/$replacements{$pat}/g; } $headerhash{lc($httpequiv)}=$content; } return \%headerhash; }