# -------------------------------------------------------------------------------------
# MKDoc::XML::Tokenizer
# -------------------------------------------------------------------------------------
# Author : Jean-Michel Hiver.
# Copyright : (c) MKDoc Holdings Ltd, 2003
#
# This module turns an XML string into a list of tokens and returns this list.
# It is using Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions"
#
# This module is distributed under the same license as Perl itself.
# -------------------------------------------------------------------------------------
package MKDoc::XML::Tokenizer;
use MKDoc::XML::Token;
use strict;
use warnings;

# Token seen just before the one currently being checked. process_data()
# localizes it for the duration of one tokenization run.
our $prev_token;


# REX/Perl 1.0
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser
# University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.
#
# Additions:
# ----------
# added 'my' and 'our' keywords in front of variables
# I like strict mode :)
my $TextSE        = "[^<]+";
my $UntilHyphen   = "[^-]*-";
my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
my $CommentCE     = "$Until2Hyphens>?";
my $UntilRSBs     = "[^\\]]*](?:[^\\]]+])*]+";
my $CDATA_CE      = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
my $S             = "[ \\n\\t\\r]+";
my $NameStrt      = "[A-Za-z_:]|[^\\x00-\\x7F]";
my $NameChar      = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
my $Name          = "(?:$NameStrt)(?:$NameChar)*";
my $QuoteSE       = "\"[^\"]*\"|'[^']*'";
my $DT_IdentSE    = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
my $MarkupDeclCE  = "(?:[^\\]\"'><]+|$QuoteSE)*>";
my $S1            = "[\\n\\r\\t ]";
my $UntilQMs      = "[^?]*\\?+";
my $PI_Tail       = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
my $DT_ItemSE     = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
my $DocTypeCE     = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
my $DeclCE        = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
my $PI_CE         = "$Name(?:$PI_Tail)?";
my $EndTagCE      = "$Name(?:$S)?>?";
my $AttValSE      = "\"[^<\"]*\"|'[^<']*'";
my $ElemTagCE     = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
my $MarkupSPE     = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
our $XML_SPE      = "$TextSE|$MarkupSPE";


# Rather than have this:
# sub ShallowParse {
#   my($XML_document) = @_;
#   return $XML_document =~ /$XML_SPE/g;
# }


# $class->process_data ($xml);
# ----------------------------
# Splits the XML string $xml into shallow-parse tokens with $XML_SPE.
#
# Returns a reference to an array of tokens, in document order. Each token is
# a reference to a scalar holding the matched text, blessed into
# MKDoc::XML::Token.
#
# Before it is stored, every token is handed (in $_) to _check_001() and
# _check_002(); $prev_token holds the token that preceded it, or '' for the
# first one. _check_002() dies with "cannot tokenize: ..." when it rejects a
# token pair, so this method can die on malformed input.
sub process_data
{
    my $class = shift;
    my $xml   = shift;

    # Remove whitespace in front of the first '<' and after the last '>' so
    # that it does not produce whitespace-only text tokens at either end.
    # Whitespace that borders text rather than markup is left untouched.
    $xml =~ s/\A\s*</</;
    $xml =~ s/>\s*\z/>/;

    # local() restores the caller's value on exit, including when a check dies.
    local $prev_token = '';

    my @res = map {
        _check_001();
        _check_002();
        $prev_token = $_;
        bless \$_, 'MKDoc::XML::Token';
    } $xml =~ /$XML_SPE/go;

    return \@res;
}

# _check_002();
# -------------
# Consistency check on the PREVIOUS token, called once per token while the
# current token is in $_ and the one before it is in the package variable
# $prev_token.
#
# The REX patterns make the closing '>' of markup optional, so input such as
# '<foo<bar>' is split into the tokens '<foo' and '<bar>' instead of being
# refused. This check turns that situation into an error.
#
# Returns nothing. Dies with "cannot tokenize: <previous><current>" when the
# previous token starts with '<' but does not end with '>'.
sub _check_002
{
    # Text tokens, and the empty string used before the first token, are not
    # markup and have nothing to close.
    $prev_token =~ /^</ or return;

    # A markup token that was followed by another token must be terminated.
    $prev_token =~ />\z/ or die "cannot tokenize: $prev_token$_";

    return;
}

# sub _check_001 { /^