package Pod::XML; # $Id: XML.pm 30 2007-02-03 16:50:07Z matt $ use strict; use warnings; use vars qw(@ISA $VERSION); use Pod::Parser; @ISA = ( 'Pod::Parser' ); $VERSION = '0.99'; # I'm not sure why Matt Sergeant did this in this way but I'll leave it for # the time being my %head2sect = ( 1 => "sect1", 2 => "sect2", 3 => "sect3", 4 => "sect4", ); # a hash array of HTML escape codes my %HTML_Escapes = ( "apos" => "#x27", # apostrophe "Aacute" => "#xC1", # capital A, acute accent "aacute" => "#xE1", # small a, acute accent "Acirc" => "#xC2", # capital A, circumflex accent "acirc" => "#xE2", # small a, circumflex accent "AElig" => "#xC6", # capital AE diphthong (ligature) "aelig" => "#xE6", # small ae diphthong (ligature) "Agrave" => "#xC0", # capital A, grave accent "agrave" => "#xE0", # small a, grave accent "Aring" => "#xC5", # capital A, ring "aring" => "#xE5", # small a, ring "Atilde" => "#xC3", # capital A, tilde "atilde" => "#xE3", # small a, tilde "Auml" => "#xC4", # capital A, dieresis or umlaut mark "auml" => "#xE4", # small a, dieresis or umlaut mark "Ccedil" => "#xC7", # capital C, cedilla "ccedil" => "#xE7", # small c, cedilla "Eacute" => "#xC9", # capital E, acute accent "eacute" => "#xE9", # small e, acute accent "Ecirc" => "#xCA", # capital E, circumflex accent "ecirc" => "#xEA", # small e, circumflex accent "Egrave" => "#xC8", # capital E, grave accent "egrave" => "#xE8", # small e, grave accent "ETH" => "#xD0", # capital Eth, Icelandic "eth" => "#xF0", # small eth, Icelandic "Euml" => "#xCB", # capital E, dieresis or umlaut mark "euml" => "#xEB", # small e, dieresis or umlaut mark "Iacute" => "#xCD", # capital I, acute accent "iacute" => "#xED", # small i, acute accent "Icirc" => "#xCE", # capital I, circumflex accent "icirc" => "#xEE", # small i, circumflex accent "Igrave" => "#xCD", # capital I, grave accent "igrave" => "#xED", # small i, grave accent "Iuml" => "#xCF", # capital I, dieresis or umlaut mark "iuml" => "#xEF", # small i, dieresis or umlaut mark "Ntilde" => "#xD1", # capital N, tilde "ntilde" => "#xF1", # small n, tilde "Oacute" => "#xD3", # capital O, acute accent "oacute" => "#xF3", # small o, acute accent "Ocirc" => "#xD4", # capital O, circumflex accent "ocirc" => "#xF4", # small o, circumflex accent "Ograve" => "#xD2", # capital O, grave accent "ograve" => "#xF2", # small o, grave accent "Oslash" => "#xD8", # capital O, slash "oslash" => "#xF8", # small o, slash "Otilde" => "#xD5", # capital O, tilde "otilde" => "#xF5", # small o, tilde "Ouml" => "#xD6", # capital O, dieresis or umlaut mark "ouml" => "#xF6", # small o, dieresis or umlaut mark "szlig" => "#xDF", # small sharp s, German (sz ligature) "THORN" => "#xDE", # capital THORN, Icelandic "thorn" => "#xFE", # small thorn, Icelandic "Uacute" => "#xDA", # capital U, acute accent "uacute" => "#xFA", # small u, acute accent "Ucirc" => "#xDB", # capital U, circumflex accent "ucirc" => "#xFB", # small u, circumflex accent "Ugrave" => "#xD9", # capital U, grave accent "ugrave" => "#xF9", # small u, grave accent "Uuml" => "#xDC", # capital U, dieresis or umlaut mark "uuml" => "#xFC", # small u, dieresis or umlaut mark "Yacute" => "#xDD", # capital Y, acute accent "yacute" => "#xFD", # small y, acute accent "yuml" => "#xFF", # small y, dieresis or umlaut mark "lchevron" => "#xAB", # left chevron (double less than) "rchevron" => "#xBB", # right chevron (double greater than) ); sub html_escape { my $text = shift || ''; # ampersand MUST be done first! $text =~ s/&/\&/g; # handle < and > too $text =~ s//\>/g; # convert other {tag:...} markers $text =~ s/{tag:escape ref='([^']*)'}/\&$1;/g; return $text; } sub finalise_output { my $parser = shift; # put something pretty together $parser->{xml_string} = "{Encoding} || "iso-8859-1" ) . "'?>\n" . "\n" . "\n" . "" . html_escape ( $parser->{title} ) . "\n" . "\n" . $parser->{xml_string} . "\n"; if ( $parser->{Encoding} ) { my $tmp = Encode::encode ( $parser->{Encoding}, $parser->{xml_string} ); $parser->{xml_string} = $tmp; } if ( ! $parser->{send_to_string} ) { my $fh = $parser->output_handle (); print $fh $parser->{xml_string}; } } sub xml_output { my ( $parser, @strings ) = @_; $parser->{xml_string} .= join ( '', @strings ); } sub begin_pod { my ( $parser ) = @_; if ( $parser->{Encoding} ) { # can we use the Encode module? eval { require Encode; }; die ( "Need Encode module to specify specific output encoding - " . $@ ) if ( $@ ); # make sure we can encode to the specific encoding eval { Encode::encode ( $parser->{Encoding}, "" ); }; die ( "Encoding issue - " . $@ ) if ( $@ ); } $parser->{headlevel} = 0; $parser->{seentitle} = 0; $parser->{closeitem} = 0; $parser->{in_begin_block} = 0; $parser->{this_is_name} = 0; $parser->{title} = ''; $parser->{xml_string} = ''; } sub end_pod { my ( $parser ) = @_; while ( $parser->{headlevel} ) { $parser->xml_output ( "{headlevel}-- } . ">\n" ); } $parser->finalise_output; } sub command { my ( $parser, $command, $paragraph ) = @_; $paragraph =~ s/\s*$//; $paragraph =~ s/^\s*//; $paragraph = $parser->interpolate ( $paragraph ); $paragraph = uri_find ( $paragraph ); $paragraph = html_escape ( $paragraph ); $paragraph =~ s/\{(\/?)tag:(.*?)\}/<$1$2>/g; $paragraph =~ s/\{code:(\d+)\}/&#$1/g; if ( $parser->{in_begin_block} == 0 ) { if ( $command =~ /^head(\d+)/ ) { my $headlevel = $1; # we should use "NAME" as the title $parser->{this_is_name}++ if ( $paragraph =~ m/^name$/i && $parser->{this_is_name} == 0 ); if ( $headlevel <= $parser->{headlevel} ) { while ( $headlevel <= $parser->{headlevel} ) { $parser->xml_output ( "{headlevel}-- }, ">\n" ); } } while ( $headlevel > ( $parser->{headlevel} + 1 ) ) { $parser->xml_output ( "<", $head2sect { ++$parser->{headlevel} }, ">\n" ); } $parser->{headlevel} = $headlevel; $parser->xml_output ( "<", $head2sect { $headlevel }, ">\n", "", $paragraph, "\n" ); } elsif ( $command eq "over" ) { if ( $parser->{closeitem} ) { $parser->xml_output ( "\n" ); $parser->{closeitem} = 0; } $parser->xml_output ( "\n" ); } elsif ( $command eq "back" ) { if ( $parser->{closeitem} ) { $parser->xml_output ( "\n" ); $parser->{closeitem} = 0; } $parser->xml_output ( "\n" ); } elsif ( $command eq "item" ) { if ( $parser->{closeitem} ) { $parser->xml_output ( "\n" ); $parser->{closeitem} = 0; } $parser->xml_output ( "" ); if ( $paragraph ne '*' ) { $paragraph =~ s/^\*\s+//; $parser->xml_output ( "", $paragraph, "\n" ); } $parser->{closeitem}++; } elsif ( $command eq 'begin' ) { # this is to strip out =begin ... =end blocks, which aren't generally POD $parser->{in_begin_block} = 1; } } elsif ( $command eq 'end' ) { $parser->{in_begin_block} = 0; } } sub verbatim { my ( $parser, $paragraph ) = @_; return if $parser->{in_begin_block}; if ( $paragraph =~ s/^(\s*)// ) { my $indent = $1; $paragraph =~ s/\s*$//; return unless length $paragraph; $paragraph =~ s/^$indent//mg; # un-indent $paragraph =~ s/\]\]>/\]\]>\]\]>{this_is_name} == 1 ) { # increment, rather than setting back to zero; this way we can ensure # the first NAME is used, but not proceeding ones $parser->{this_is_name}++; $parser->{title} = ""; } $parser->xml_output ( "\n" ); } } sub textblock { my ( $parser, $paragraph, $line_num ) = @_; return if $parser->{in_begin_block}; $paragraph =~ s/^\s*//; $paragraph =~ s/\s*$//; my $text = $parser->interpolate ( $paragraph ); $text = uri_find ( $text ); $text = html_escape ( $text ); $text =~ s/\{(\/?)tag:(.*?)\}/<$1$2>/g; $text =~ s/\{code:(\d+)\}/&#$1/g; if ( $parser->{this_is_name} == 1 ) { # increment, rather than setting back to zero; this way we can ensure the # first NAME is used, but not proceeding ones $parser->{this_is_name}++; $parser->{title} = $paragraph; } if ( $parser->{headlevel} == 0 ) { $parser->xml_output ( "\n", $parser->{title}, "\n" ); $parser->{headlevel}++; } $parser->xml_output ( "\n", $text, "\n\n" ); } sub uri_find { my $text = shift || ''; # Code from the Perl Cookbook my $urls = '(https|http|telnet|gopher|file|wais|ftp|mailto)'; my $ltrs = '\w'; my $gunk = '/#~:.?+=&%@!\-'; my $punc = '.:?\-!,'; my $any = $ltrs . $gunk . $punc; my $new; while ( $text =~ m{ \G # anchor to last match place (.*?) # catch stuff before match in $1 \b # start at word boundary ( # BEGIN $2 $urls : # http: (?![:/]) # negative lookahead for : or / [$any]+? # followed by 1 or more allowed charact ) # END $2 (?= # look ahead after $2 [$punc]* # for 0 or more punctuation characters ( [^$any] # followed by a non-URL character | \Z # or alternatively the end of the html ) ) # end of look ahead }igcsox ) { my ( $pre, $url ) = ( $1, $2 ); $new .= $pre; $new .= "\{tag:xlink uri='" . $url . "'\}" . $url . "\{/tag:xlink\}"; } $text =~ /\G(.*)/gcs; $new .= $1 if defined $1; return $new; } sub interior_sequence { my ( $parser, $seq_command, $seq_argument ) = @_; my $fh = $parser->output_handle (); if ( $seq_command eq 'C' ) { return "\{tag:code\}" . $seq_argument . "\{\/tag:code\}"; } elsif ( $seq_command eq 'I' ) { return "\{tag:emphasis\}" . $seq_argument . "\{\/tag:emphasis\}"; } elsif ( $seq_command eq 'B' ) { return "\{tag:strong\}" . $seq_argument . "\{\/tag:strong\}"; } elsif ( $seq_command eq 'S' ) { $seq_argument =~ s/ /\{char:160\}/g; return $seq_argument; } elsif ( $seq_command eq 'F' ) { return "\{tag:filename\}" . $seq_argument . "\{\/tag:filename\}"; } elsif ( $seq_command eq 'X' ) { return "\{tag:index\}" . $seq_argument . "\{\/tag:index\}"; } elsif ( $seq_command eq 'L' ) { # parse L<>, can be any of: # L or L (other page or section in this page) # L (item in a other page) # L (section in other page) # L<"sect"> (same as L) # L (same as L) # L (same as L) # plus any of the above can be prefixed with text| to use # that text as the link text. # Additionally, there can also be; # L # which SHOULD NOT be prepended label| $seq_argument =~ s/[\r\n]/ /g; my $text = $seq_argument; if ( $seq_argument =~ /^([^|]+)\|(.*)$/ ) { $text = $1; $seq_argument = $2; } if ( $seq_argument =~ /^[a-z]+:\//i ) { $text ||= $seq_argument; } elsif ( $seq_argument =~ /^(.*?)\/(.*)$/ ) { # name/ident or name/"sect" my $ident_or_sect = $2; $seq_argument = $1; if ( $ident_or_sect =~ /^\"(.*)\"$/ ) { my $sect = $1; $sect = substr ( $sect, 0, 30 ); $sect =~ s/\s/_/g; $seq_argument .= '#' . $sect; } else { $seq_argument .= '#' . $ident_or_sect; } } elsif ( $seq_argument =~ /^\\?\"(.*)\"$/ ) { my $sect = $1; $sect = substr ( $sect, 0, 30 ); $sect =~ s/\s/_/g; $seq_argument = '#' . $sect; } return "\{tag:link xref='" . $seq_argument . "'\}" . $text . "\{\/tag:link\}"; } elsif ( $seq_command eq 'E' ) { # E<> codes can be numerical! if ( $seq_argument =~ m/^(0[0-7]+|[0-9]+)$/ ) { # it's octal, convert to decimal! $seq_argument = oct ( $seq_argument ) if $seq_argument =~ m/^0/; # convert to hex $seq_argument = sprintf ( '0x%x', $seq_argument ); } if ( $seq_argument =~ m/^0x([0-9A-Fa-f]{2,4})$/ ) { # E<> is hex! $seq_argument = "#x" . $1; } else { # if we know about this code then translate it into hex if ( exists $HTML_Escapes { $seq_argument } ) { $seq_argument = $HTML_Escapes { $seq_argument }; } } # probably a HTML escape code $seq_argument = "{tag:escape ref='" . $seq_argument . "'}"; return $seq_argument; } } 1; __END__ =head1 NAME Pod::XML - Module to convert POD to XML =head1 SYNOPSIS use Pod::XML; my $parser = Pod::XML->new(); $parser->parse_from_file("foo.pod"); =head1 DESCRIPTION This module uses Pod::Parser to parse POD and generates XML from the resulting parse stream. It uses its own format, described below. =head1 OPTIONS =over 2 =item Encoding Specify the output encoding of the XML file. Requires Encode Perl module. =back =head1 XML FORMAT The XML format is not a standardised format - if you wish to generate some standard XML format such as docbook, please use a tool such as XSLT to convert between this and that format. The format uses the namespace "http://axkit.org/ns/2000/pod2xml". Do not try and request this URI - it is virtual. You will get a 404. The best way to describe the format is to show you: The first =head1 goes in here Subsequent =head1's create a sect1 Ordinary paragraphs of text create a para tag. =head2's go in sect2 Up to =head4 is supported (despite not really being supported by pod), producing sect3 and sect4 respectively for =head3 and =head4. Bold text goes in a strong tag. Italic text goes in a emphasis tag. Code goes in a code tag. Lists (=over, =item, =back) go in list/item/itemtext tags. The itemtext element is only present if the =item text is not the "*" character. If the first =head1 is "NAME" (like standard perl modules are supposed to be) it takes the next paragraph as the document title. Other standard head elements of POD are left unchanged (particularly, the SYNOPSIS and DESCRIPTION elements of standard POD). Pod::XML tries to be careful about nesting sects based on the head level in the original POD. Let me know if this doesn't work for you. =head1 AUTHOR Original version by Matt Sergeant, matt@sergeant.org Version 0.95+ by Matt Wilson Ematt@mattsscripts.co.ukE =head1 MAINTAINER Matt Wilson Ematt@mattsscripts.co.ukE =head1 SEE ALSO L =head1 LICENSE This is free software, you may use it and distribute it under the same terms as Perl itself. =cut # vim:ts=2:sw=2:et