#
# Copyright (c) 2001 by Jim Menard
#
# Released under the same license as Ruby. See
# http://www.ruby-lang.org/en/LICENSE.txt.
#

require 'nqxml/error'
require 'nqxml/entities'
require 'nqxml/utils'

module NQXML

  # One level of tokenizer input: a string plus a read cursor (+pos+).
  # +replaceRefs+ records whether general entity references found in this
  # input should themselves be expanded.
  class Input
    attr_accessor :pos
    attr_reader :string, :length, :replaceRefs

    def initialize(str, replaceRefs)
      @string = str
      @replaceRefs = replaceRefs
      @pos = 0
      @length = @string.length
    end

    # True once the cursor has reached (or passed) the end of the string.
    def eof?
      return @pos >= @length
    end
  end

  # Splits XML text into entity objects. This part of the class holds the
  # lexical constants, the constructor and the position-reporting methods.
  class Tokenizer

    # Don't need \xd because we either strip them or turn them into \xa
    # on the way in.
    NOT_SPACES_REGEX = /[^\x20\x9\xa]/m

    # Characters legal in a PUBLIC id literal (XML production PubidChar).
    # Parentheses are legal there; curly braces are not.
    # Unnecessary backslashes, yes, but I was getting funky errors
    PUBLIC_ID_LITERAL_REGEX = /\A[-a-zA-Z0-9\'\(\)\+\,\.\/\:\=\?\;!\*\#\@\$\_\%\x20\xa]+\z/

    # An XML name at the very start of the subject string. Anchored with \A
    # rather than ^ because ^ also matches after every newline, which let a
    # name on a *later* line satisfy the match. The Latin-1 letters are
    # written as code points (\u00c0 ...) so the literal is valid in a UTF-8
    # source file and matches UTF-8 input.
    NAME_REGEX = /\A([a-zA-Z_:\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff][-a-zA-Z0-9_:\.\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u00b7]*)/

    # Encoding name (XML production EncName): a letter followed by zero or
    # more letters, digits, '.', '_' or '-'.
    ENCODING_NAME_REGEX = /\A[A-Za-z][-A-Za-z0-9\._]*\z/

    BRACKET_OR_AMP_REGEX = /[<&]/

    # stringOrReadable:: a String of XML, or any object with a +read+
    #                    method that returns one.
    # Raises ParserError for any other kind of argument.
    def initialize(stringOrReadable)
      @inputStack = []

      xml =
        if stringOrReadable.kind_of?(String)
          stringOrReadable
        elsif stringOrReadable.respond_to?(:read)
          stringOrReadable.read
        else
          # @inputStack is still empty here, so line() and column()
          # report -1 for this error.
          str = "illegal argument: #{stringOrReadable} must be a " +
            "String or have a 'read' method"
          raise ParserError.new(str, self)
        end

      # All line breaks normalized on input to #xA. Non-destructive gsub so
      # the caller's string is left untouched and frozen strings work.
      xml = xml.gsub("\r\n", "\n").gsub("\r", "\n")

      @inputStack.push(@currInput = Input.new(xml, true))
      @generatedEndTag = nil
      @internalEntities = {}
      @paramEntities = {}
    end

    # True when the outermost input is used up and no synthesized end tag
    # is waiting to be returned.
    def eof?
      # return @generatedEndTag.nil? && @inputStack[0].eof?
      return @generatedEndTag.nil? &&
        @inputStack[0].pos >= @inputStack[0].length
    end
    alias_method :atEnd, :eof?
    alias_method :eof, :eof?

    # Returns the line number within the XML.
    def line
      return -1 if @inputStack.empty? # only if initialize() arg is bogus
      input = @inputStack[0]          # not @inputStack.last
      str = input.string[0 .. input.pos]
      return str.count("\n") + 1
    end

    # Returns the column number within the current line of XML.
    # (This sentence documents #column, the next method of this class.)
  end
end
module NQXML
  class Tokenizer

    # Returns the column number within the current line of the outermost
    # input, or -1 when there is no input at all.
    def column
      return -1 if @inputStack.empty? # only if initialize() arg is bogus
      input = @inputStack[0]          # not @inputStack.last
      currLineStart = input.string.rindex("\n", input.pos)
      return input.pos + 1 if currLineStart.nil?
      return input.pos - currLineStart + 1
    end

    # Replace general entity refs, but not predefined and character refs
    # (those are copied through unchanged).
    #
    # str::       text to expand; nil is passed straight back as nil.
    # expanding:: names of the entities currently being expanded; used only
    #             to detect an entity that (indirectly) refers to itself.
    #
    # Returns a new string. Raises ParserError for an undefined or
    # self-referential entity.
    def replaceOnlyEntityRefs(str, expanding = [])
      return nil if str.nil?

      remaining = str
      replacement = ''
      while (m = /&([^&;]*);/m.match(remaining))
        replacement << m.pre_match
        ref = m[0]
        refName = m[1]
        remaining = m.post_match

        # Character references (decimal or hex, any number of digits) and
        # the five predefined entities are left for other passes.
        if refName =~ /\A(#[0-9]*|#x[0-9A-Fa-f]*|amp|quot|apos|gt|lt)\z/
          replacement << ref
          next
        end

        val = @internalEntities[refName]
        if val.nil?
          raise ParserError.new("entity reference '#{ref}' is undefined",
                                self)
        end
        if expanding.include?(refName)
          # Without this check the expansion recurses until the stack
          # overflows.
          raise ParserError.new("entity reference '#{ref}' is recursive",
                                self)
        end
        replacement << replaceAllRefsButParams(val, expanding + [refName])
      end
      replacement << remaining
      return replacement
    end

    # Replace character, predefined, and general entity refs.
    # +expanding+ is forwarded to replaceOnlyEntityRefs for cycle detection.
    # Returns nil when +str+ is nil.
    def replaceAllRefsButParams(str, expanding = [])
      return nil if str.nil?
      copy = NQXML.replaceCharacterRefs(str)
      copy = NQXML.replacePredefinedEntityRefs(copy)
      return replaceOnlyEntityRefs(copy, expanding)
    end

    # Replace parameter entity refs ('%name;'), recursively. General entity
    # refs are not touched.
    #
    # Returns a new string (nil when +str+ is nil). Raises ParserError for
    # an undefined or self-referential parameter entity.
    def replaceParamRefs(str, expanding = [])
      return nil if str.nil?

      remaining = str
      replacement = ''
      while (m = /%([^%;]*);/m.match(remaining))
        replacement << m.pre_match
        ref = m[0]
        refName = m[1]
        remaining = m.post_match

        val = @paramEntities[refName]
        if val.nil?
          raise ParserError.new("entity reference '#{ref}' is undefined",
                                self)
        end
        if expanding.include?(refName)
          raise ParserError.new("entity reference '#{ref}' is recursive",
                                self)
        end
        replacement << replaceParamRefs(val, expanding + [refName])
      end
      replacement << remaining
      return replacement
    end

    # Return the currently-used input object. If the input on the top
    # of the stack is empty, pop it off the stack and return the next one.
    #
    # Now inlined everywhere this used to be called.
    # (These sentences document the commented-out #input method that
    # follows in this class.)
  end
end
module NQXML
  class Tokenizer

    # The historical accessor, kept for reference:
    #
    # def input
    #   while @currInput.pos >= @currInput.length
    #     @inputStack.pop()
    #     @currInput = @inputStack.last
    #   end
    #   return @currInput
    # end

    # Returns true if str matches the beginning of the current
    # position in input stream.
    def peekMatches?(str)
      popExhaustedInputs
      return @currInput.string[@currInput.pos, str.length] == str
    end

    # Returns the next character without consuming it ('' at end of input).
    def peekChar()
      popExhaustedInputs
      return @currInput.string[@currInput.pos, 1]
    end

    # Consumes and returns the next +n+ characters of the current input.
    def nextChar(n = 1)
      popExhaustedInputs
      str = @currInput.string[@currInput.pos, n]
      @currInput.pos += n
      return str
    end
    alias_method :nextChars, :nextChar

    # Advances the cursor of the current input by +n+ characters.
    def skipChar(n = 1)
      popExhaustedInputs
      @currInput.pos += n
    end
    alias_method :skipChars, :skipChar

    # Moves the cursor past any whitespace. When an input holds nothing but
    # whitespace up to its end, it is dropped and skipping continues in the
    # input underneath it; at the end of the outermost input this stops.
    def skipSpaces
      loop do
        popExhaustedInputs
        endOfSpaces = @currInput.string.index(NOT_SPACES_REGEX,
                                              @currInput.pos)
        if endOfSpaces
          @currInput.pos = endOfSpaces
          return
        end

        # There's nothing but whitespace from here until end of the
        # current input stream.
        @currInput.pos = @currInput.length
        return if @inputStack.length <= 1 # true end of input
      end
    end

    # Returns text up to but not including specified string or regexp.
    # Positions text cursor after the text found.
    #
    # str::             String or Regexp to search for.
    # strIsRegex::      true when +str+ is a Regexp.
    # errorIfNotFound:: when true, a missing +str+ raises ParserError;
    #                   otherwise the rest of the current input is returned.
    def textUpTo(str, strIsRegex, errorIfNotFound)
      popExhaustedInputs

      if strIsRegex
        textEnd = (str =~ @currInput.string[@currInput.pos .. -1])
        textEnd += @currInput.pos if textEnd
      else
        textEnd = @currInput.string.index(str, @currInput.pos)
      end

      if textEnd.nil?
        if errorIfNotFound
          raise ParserError.new("unexpected EOF: missing #{str}", self)
        end
        textEnd = @currInput.length
      end

      # The slice is nil only when the cursor is already past the end.
      text = @currInput.string[@currInput.pos ... textEnd] || ''
      skipChars(text.length)
      return text
    end

    # Returns the next legal XML name. Will raise an exception if the
    # next available character is not legal.
    def nextName
      popExhaustedInputs

      rest = @currInput.string[@currInput.pos .. -1]
      match = rest && NAME_REGEX.match(rest)
      # Only a match that starts exactly at the cursor counts; a name found
      # further along (e.g. at the start of a later line) is not "next".
      name = (match && match.begin(0) == 0) ? match[1] : nil
      if name.nil?
        str = "expected name but saw illegal non-name character"
        raise ParserError.new(str, self)
      end
      skipChars(name.length)
      return name
    end

    # Reads a single- or double-quoted literal and returns the text between
    # the quotes. +tagName+ is used only in the error message.
    def nextQuotedLiteral(tagName)
      quote = peekChar()
      if quote != '"' && quote != "'"
        str = "quoted literal not quoted in tag #{tagName}"
        raise ParserError.new(str, self)
      end
      skipChar()                  # eat quote
      text = textUpTo(quote, false, true)
      skipChar()                  # eat quote
      return text
    end

    # Returns a PUBLIC id literal, which is different from a "simple"
    # quoted literal. The returned string includes the quotes.
    def nextPublicIdLiteral(tagName)
      quote = peekChar()
      if quote != '"' && quote != "'"
        str = "quoted literal not quoted in #{tagName} PUBLIC id"
        raise ParserError.new(str, self)
      end
      skipChar()                  # eat quote
      text = textUpTo(quote, false, true)
      if !(text =~ PUBLIC_ID_LITERAL_REGEX)
        str = "#{tagName} PUBLIC public id literal contains illegal" +
          ' character(s)'
        raise ParserError.new(str, self)
      end
      skipChar()                  # eat quote

      # FIX - we are returning literal text, but should do something
      # intelligent.
      return quote + text + quote
    end

    # Normalize the attribute value using the rules in section 3.3.3 of
    # the XML spec, "Attribute-Value Normalization".
    #
    # Returns the normalized string (nil for nil input). Raises ParserError
    # for an undefined entity, a '<', or an '&' that does not start a
    # reference terminated by ';'.
    def normalizeAttributeValue(str)
      return nil if str.nil?

      val = str
      result = ''
      until val.empty?
        pos = val =~ /[&\s<]/m
        if pos.nil?
          result << val
          return result
        end
        result << val[0, pos]
        val = val[pos .. -1]

        if (m = /\A&#x([0-9a-f]*);/i.match(val))
          result << charForCodePoint(m[1].hex, m[0])
          val = m.post_match
        elsif (m = /\A&#([0-9]*);/.match(val))
          result << charForCodePoint(m[1].to_i, m[0])
          val = m.post_match
        elsif (m = /\A&([^&;]*);/.match(val))
          ref = m[1]
          case ref
          when 'amp'  then result << '&'
          when 'lt'   then result << '<'
          when 'gt'   then result << '>'
          when 'quot' then result << '"'
          when 'apos' then result << '\''
          else
            replacement = @internalEntities[ref]
            if replacement.nil?
              str = "entity reference '#{ref}' is undefined"
              raise ParserError.new(str, self)
            end
            if !replacement.index('<').nil?
              str = "attribute values may not contain '<'"
              raise ParserError.new(str, self)
            end
            result << normalizeAttributeValue(replacement)
          end
          val = m.post_match
        elsif (m = /\A\s+/m.match(val))
          result << ' '
          val = m.post_match
        elsif val[0, 1] == '<'
          str = "attribute values may not contain '<'"
          raise ParserError.new(str, self)
        else
          # A bare '&'. Nothing above consumes it, so without this branch
          # the loop would never terminate.
          str = "attribute values may not contain '&' unless it starts" +
            " a reference ending in ';'"
          raise ParserError.new(str, self)
        end
      end
      return result
    end

    # Returns hash of attributes. If no attributes, returns empty hash.
    # If error, raises an exception. An attribute name with no '=' gets
    # the value ''.
    def nextTagAttributes(typeName, name)
      attrs = {}
      skipSpaces()
      c = peekChar()
      while !c.nil? && c =~ /[a-zA-Z_:]/ # next legal attrib name char
        key = nextName()
        skipSpaces()
        if peekMatches?('=')
          skipChar()
          skipSpaces()
          val = normalizeAttributeValue(nextQuotedLiteral(name))
        else
          val = ''
        end

        # Well-formedness constraint: attribute names may appear
        # only once.
        if attrs.key?(key)
          str = "malformed #{typeName} '#{name}': attribute name" +
            " '#{key}' appears more than once"
          raise ParserError.new(str, self)
        end
        attrs[key] = val

        skipSpaces()
        c = peekChar()
      end
      return attrs
    end

    # Finishes an XML declaration whose name has already been read and
    # returns an XMLDecl. +input+ and +sourceStartPos+ locate the source
    # text of the declaration.
    def restOfXMLDecl(name, input, sourceStartPos)
      # Read attributes, if any.
      attrs = nextTagAttributes('xml decl', name)

      # Make sure we close with '?>'
      skipSpaces()
      if !peekMatches?('?>')
        str = "malformed processing instruction '#{name}':" +
          " missing '?>' after attributes"
        raise ParserError.new(str, self)
      end
      skipChars(2)                # eat '?>'

      encoding = attrs['encoding']
      if encoding && encoding !~ ENCODING_NAME_REGEX
        str = "xml encoding name \"#{encoding}\" contains" +
          " illegal characters"
        raise ParserError.new(str, self)
      end

      standalone = attrs['standalone']
      if standalone && standalone != 'yes' && standalone != 'no'
        str = "xml standalone attribute \"#{standalone}\"" +
          " illegal; must be \"yes\" or \"no\""
        raise ParserError.new(str, self)
      end

      source = input.string[sourceStartPos ... input.pos]
      return XMLDecl.new(name, attrs, source)
    end

    # Finishes a processing instruction whose name has already been read
    # and returns a ProcessingInstruction.
    def restOfProcessingInstruction(name, input, sourceStartPos)
      text = textUpTo('?>', false, true) # Read remaining text
      skipChars(2)                       # eat '?>'
      source = input.string[sourceStartPos ... input.pos]
      return ProcessingInstruction.new(name, text, source)
    end

    # Reads a processing instruction or XML declaration. The source start
    # is taken as two characters before the cursor, i.e. the caller has
    # already consumed the two-character opener.
    def nextProcessingInstruction
      popExhaustedInputs
      sourceStartPos = @currInput.pos - 2

      # Get name
      # do not skip spaces; they are not allowed before the name.
      # skipSpaces()
      name = nextName()
      skipSpaces()

      if name.downcase == 'xml'
        return restOfXMLDecl(name, @currInput, sourceStartPos)
      else
        return restOfProcessingInstruction(name, @currInput,
                                           sourceStartPos)
      end
    end

    # Drops used-up inputs from the top of the stack so that @currInput is
    # the input the next character comes from. The outermost input is
    # never dropped, so @currInput stays usable at end of document.
    def popExhaustedInputs
      while @inputStack.length > 1 &&
          @currInput.pos >= @currInput.length
        @inputStack.pop
        @currInput = @inputStack.last
      end
    end

    # Returns the UTF-8 string for a character reference's code point.
    # +ref+ is the reference text, used in the error message.
    def charForCodePoint(code, ref)
      return [code].pack('U')
    rescue RangeError
      raise ParserError.new("character reference '#{ref}' is out of range",
                            self)
    end

    private :popExhaustedInputs, :charForCodePoint

    # Handle declsep (parameter entity reference) inside DOCTYPE.
    # Unlike most next* methods, we don't return an entity. Instead we
    # replace the declsep (parameter entity reference) and create
    # a new input stream containing the replacement text.
    # (These sentences document #nextDeclSep, the next method of this
    # class.)
  end
end
module NQXML
  class Tokenizer

    # NOTE(review): in the text this section was restored from, every span
    # that looked like markup (from '<!' up to the following '>') had been
    # stripped. The affected statements are marked "reconstructed" below;
    # they were rebuilt from the surviving code around them and should be
    # confirmed against a pristine copy of the NQXML sources.

    # Handle declsep (parameter entity reference) inside DOCTYPE.
    # Unlike most next* methods, we don't return an entity. Instead we
    # replace the declsep (parameter entity reference) and create
    # a new input stream containing the replacement text.
    def nextDeclSep
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end

      name = nextName()
      if !peekMatches?(';')
        str = "illegal parameter entity reference '%#{name}' is" +
          " missing trailing semicolon"
        raise ParserError.new(str, self)
      end
      skipChar()                  # eat ';'

      # Get replacement text and raise an error if it is not defined.
      # Per the XML spec, surround replacement with single spaces.
      if @paramEntities[name].nil?
        str = "undefined parameter entity '%#{name};' in DOCTYPE tag"
        raise ParserError.new(str, self)
      end
      replacement = ' ' + @paramEntities[name] + ' '

      # Create a new input stream.
      @inputStack.push(@currInput = Input.new(replacement, true))
    end

    # Reads the rest of a comment (the opening delimiter has already been
    # consumed by the caller) and returns a Comment.
    def nextComment
      text = textUpTo('-->', false, true)
      while text[-1, 1] == '-'    # "--->" is not a legal comment ending
        $stderr.puts "warning: illegal comment end string '--->'" +
          " seen and ignored"
        skipChars(3)              # eat '-->'
        text << textUpTo('-->', false, true)
      end
      skipChars(3)                # eat '-->'
      # Reconstructed: the first argument survived only as "".
      # NOTE(review): the second argument is kept exactly as found; confirm
      # that Comment's constructor really takes it.
      return Comment.new("<!--#{text}-->", @internalEntities)
    end

    # Reads a CDATA section (cursor on '[CDATA[') and returns it as Text.
    def nextCdata
      skipChars(7)                # eat '[CDATA['
      text = textUpTo(']]>', false, true)
      skipChars(3)                # eat ']]>'
      # Reconstructed: the source-text argument survived only as "".
      return Text.new(text, "<![CDATA[#{text}]]>")
    end

    # Return next PUBLIC or SYSTEM external id. Returns nil if the
    # next word is not either 'PUBLIC' or 'SYSTEM'.
    def nextExternalId(tagName)
      externalId = nil
      idSourceStartPos = @currInput.pos

      if peekMatches?('PUBLIC')
        skipChars(6)
        skipSpaces()
        pubid = nextPublicIdLiteral(tagName)
        skipSpaces()
        c = peekChar()
        if c == '>' || c == '['
          str = "#{tagName} tag's PUBLIC external id requires two" +
            " arguments; only one seen"
          raise ParserError.new(str, self)
        end
        # c is quote char (we need to hang on to it because we
        # lose it in nextQuotedLiteral()). If the next char isn't
        # a quote char, then nextQuotedLiteral() will do the
        # complaining for us.
        system = c + nextQuotedLiteral("#{tagName} (PUBLIC)") + c
        source = @currInput.string[idSourceStartPos ... @currInput.pos]
        externalId = PublicExternalID.new(pubid, system, source)
      elsif peekMatches?('SYSTEM')
        skipChars(6)
        skipSpaces()
        c = peekChar()            # Grab quote char
        system = c + nextQuotedLiteral("#{tagName} (SYSTEM)") + c
        source = @currInput.string[idSourceStartPos ... @currInput.pos]
        externalId = SystemExternalID.new(system, source)

        # A bit of extra error checking: a second literal may be quoted
        # with either kind of quote.
        skipSpaces()
        extra = peekChar()
        if extra == '"' || extra == "'"
          str = "#{tagName} tag's SYSTEM external id only has one" +
            " argument; two seen"
          raise ParserError.new(str, self)
        end
      end
      return externalId
    end

    # Reads an entity declaration (cursor on its '<') and returns a
    # ParameterEntityTag or a GeneralEntityTag.
    #
    # NOTE(review): everything between the skipChars(8) call and the check
    # for the closing '>' is reconstructed. In particular, the registration
    # in @paramEntities / @internalEntities, the use of replaceParamRefs on
    # the literal, the NDATA handling and the wording held in errorStrExtra
    # are assumptions -- confirm against the original.
    def nextEntityTag
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end

      sourceStartPos = @currInput.pos
      skipChars(8)                # eat '<!ENTITY'
      skipSpaces()

      type = :GENERAL_ENTITY
      errorStrExtra = ''
      if peekMatches?('%')
        type = :PARAMETER_ENTITY
        errorStrExtra = 'parameter entity '
        skipChar()                # eat '%'
        skipSpaces()
      end

      name = nextName()
      skipSpaces()

      entityValue = nil
      externalId = nil
      ndataName = nil
      quote = peekChar()
      if quote == '"' || quote == "'"
        entityValue = replaceParamRefs(nextQuotedLiteral('ENTITY'))
        if type == :PARAMETER_ENTITY
          @paramEntities[name] = entityValue
        else
          @internalEntities[name] = entityValue
        end
      else
        externalId = nextExternalId('ENTITY')
        if externalId.nil?
          str = "ENTITY #{errorStrExtra}'#{name}' has neither a value" +
            " nor an external id"
          raise ParserError.new(str, self)
        end
        skipSpaces()
        if type == :GENERAL_ENTITY && peekMatches?('NDATA')
          skipChars(5)            # eat 'NDATA'
          skipSpaces()
          ndataName = nextName()
        end
      end

      skipSpaces()
      if !peekMatches?('>')
        str = "missing '>' after ENTITY #{errorStrExtra}" +
          "'#{name}' value"
        raise ParserError.new(str, self)
      end
      skipChar()                  # eat '>'

      src = @currInput.string[sourceStartPos ... @currInput.pos]
      if type == :PARAMETER_ENTITY
        return ParameterEntityTag.new(name, entityValue, externalId, src)
      else
        return GeneralEntityTag.new(name, entityValue, externalId,
                                    ndataName, src)
      end
    end

    # Reads an element declaration (cursor on its '<') and returns an
    # Element. The name/args reads are reconstructed.
    def nextElementDecl
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end

      sourceStartPos = @currInput.pos
      skipChars(9)                # eat '<!ELEMENT'
      skipSpaces()
      name = nextName()
      skipSpaces()
      args = textUpTo('>', false, true)
      skipChar()                  # eat '>'
      return Element.new(name, args,
                         @currInput.string[sourceStartPos ... @currInput.pos])
    end

    # Reads an attribute-list declaration (cursor on its '<') and returns
    # an Attlist. The name/args reads are reconstructed.
    def nextAttributeList
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end

      sourceStartPos = @currInput.pos
      skipChars(9)                # eat '<!ATTLIST'
      skipSpaces()
      name = nextName()
      skipSpaces()
      args = textUpTo('>', false, true)
      skipChar()                  # eat '>'
      return Attlist.new(name, args,
                         @currInput.string[sourceStartPos ... @currInput.pos])
    end

    # Reads a notation declaration (cursor on its '<') and returns a
    # Notation. The name/args reads are reconstructed.
    def nextNotation
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end

      sourceStartPos = @currInput.pos
      skipChars(10)               # eat '<!NOTATION'
      skipSpaces()
      name = nextName()
      skipSpaces()
      args = textUpTo('>', false, true)
      skipChar()                  # eat '>'
      return Notation.new(name, args,
                          @currInput.string[sourceStartPos ... @currInput.pos])
    end

    # Reads a document type declaration and returns a Doctype. The cursor
    # is on 'DOCTYPE'; the source start is taken two characters earlier.
    #
    # NOTE(review): the body of the internal-subset loop is reconstructed
    # from the declaration readers above (which start on '<') and from
    # nextProcessingInstruction / nextComment / nextDeclSep (which start
    # after their opener). Confirm the branch list and the final error.
    def nextDoctype
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end

      sourceStartPos = @currInput.pos - 2
      skipChars(7)                # eat 'DOCTYPE'

      # Name
      skipSpaces()
      name = nextName()

      # External id
      skipSpaces()
      externalId = nextExternalId('DOCTYPE') # May return nil

      # Markupdecl and declsep entities, if any
      skipSpaces()
      entities = nil
      if peekMatches?('[')        # markupdecl and declsep entities
        entities = []
        skipChar()                # eat '['
        skipSpaces()
        while !peekMatches?(']')
          if peekMatches?('<!ENTITY')
            entities << nextEntityTag()
          elsif peekMatches?('<!ELEMENT')
            entities << nextElementDecl()
          elsif peekMatches?('<!ATTLIST')
            entities << nextAttributeList()
          elsif peekMatches?('<!NOTATION')
            entities << nextNotation()
          elsif peekMatches?('<?')
            skipChars(2)          # eat '<?'
            entities << nextProcessingInstruction()
          elsif peekMatches?('<!--')
            skipChars(4)          # eat '<!--'
            entities << nextComment()
          elsif peekMatches?('%')
            skipChar()            # eat '%'
            nextDeclSep()
          else
            str = "unexpected text inside DOCTYPE internal subset"
            raise ParserError.new(str, self)
          end
          skipSpaces()
        end
        skipChar()                # eat ']'
        skipSpaces()
      end

      if !peekMatches?('>')
        raise ParserError.new("DOCTYPE tag missing '>'", self)
      end
      skipChar()                  # eat '>'

      return Doctype.new(name, externalId, entities,
                         @currInput.string[sourceStartPos ... @currInput.pos])
    end

    # Dispatches on what follows '<!': comment, CDATA section or DOCTYPE.
    # Anything else raises ParserError.
    def nextBangTag
      if peekMatches?('--')
        skipChars(2)
        return nextComment()
      elsif peekMatches?('[CDATA[')
        return nextCdata()
      elsif peekMatches?('DOCTYPE')
        return nextDoctype()
      else
        text = textUpTo('>', false, true)
        # Reconstructed: the tail of this message had been stripped.
        raise ParserError.new("unknown or unexpected tag <!#{text}>", self)
      end
    end

    # Reads a start, end or empty-element tag (the '<' has already been
    # consumed) and returns a Tag. For an empty-element tag the matching
    # end Tag is stored in @generatedEndTag for the next call to
    # nextEntity.
    def nextTag
      while @currInput.pos >= @currInput.length
        @inputStack.pop()
        @currInput = @inputStack.last
      end
      sourceStartPos = @currInput.pos - 1

      # Determine if negated by starting slash
      isTagEnd = peekMatches?('/')
      skipChar() if isTagEnd

      # Get name
      skipSpaces()
      name = nextName()

      # Read attributes
      attrs = isTagEnd ? nil : nextTagAttributes('tag', name)

      # Check for slash at end of tag
      skipSpaces()
      c = peekChar()
      makeNegatedCopy = (c == '/')
      if makeNegatedCopy
        if isTagEnd
          str = "malformed tag '#{name}': slash appears at both" +
            " beginning and end of tag"
          raise ParserError.new(str, self)
        end
        skipChar()                # eat '/'
        c = peekChar()
      end

      if c != '>'
        str = "malformed tag '#{name}': missing '>' after attributes"
        raise ParserError.new(str, self)
      end
      skipChar()                  # eat '>'

      source = @currInput.string[sourceStartPos ... @currInput.pos]
      if makeNegatedCopy
        # Create the end tag that will be returned as the next token.
        @generatedEndTag = Tag.new(name, nil, true, source)
      end

      return Tag.new(name, attrs, isTagEnd, source)
    end

    # Consumes '<' and dispatches on the character after it.
    def nextBracketToken
      skipChar()                  # eat '<'
      case peekChar()
      when '?'
        skipChar()                # eat '?'
        return nextProcessingInstruction()
      when '!'
        skipChar()                # eat '!'
        return nextBangTag()
      else
        return nextTag()
      end
    end

    # Collects character data up to the next '<', expanding references on
    # the way. Returns a Text, or nil when no text was collected.
    def nextText
      text = ''
      while true
        text << textUpTo(BRACKET_OR_AMP_REGEX, true, false)

        # We are done if this is the end of this input stream or
        # if the next char is a '<'. NOTE: the check for eof must
        # come first because peekChar() will go beyond eof to the
        # next (popped) input stream.
        if eof?() || peekChar() == '<'
          return text.empty? ? nil : Text.new(text)
        end

        next unless peekChar() == '&'

        ref = textUpTo(';', false, true) # '&...'
        ref << nextChar()                # ';'
        ref = NQXML.replaceCharacterRefs(ref)
        ref = NQXML.replacePredefinedEntityRefs(ref)

        if ref[0, 1] == '&' && ref.length > 1 &&
            @inputStack.last.replaceRefs
          # We are looking at an entity reference, and this
          # input wants to replace those.
          replacement = @internalEntities[ref[1..-2]]
          if replacement.nil?
            str = "entity reference '#{ref}' is undefined"
            raise ParserError.new(str, self)
          end

          # Before we create a new input object so we can
          # recursively parse the replacement text, let's
          # slurp up as much "simple" text as we can.
          simple = /\A[^<&]*/.match(replacement)
          text << simple[0]
          replacement = simple.post_match

          if !replacement.empty?
            @currInput = Input.new(replacement, false)
            @inputStack.push(@currInput)
            moreText = nextText()
            text << moreText.text if moreText
          end
        else
          # append ref to text and move on
          text << ref
        end
      end
    end

    # Used by each(). Returns the next entity: a pending generated end
    # tag, a bracketed token, or text.
    def nextEntity
      entity = nil
      if @generatedEndTag
        entity = @generatedEndTag
        @generatedEndTag = nil
      elsif peekMatches?('<')
        entity = nextBracketToken()
      else
        entity = nextText()
        # nextText() returns nil when a substitution caused a tag
        # to be introduced into the text.
        entity = nextEntity() if entity.nil?
      end
      return entity
    end

    # The main token generator. Yields each entity in turn; without a
    # block, returns an Enumerator over the same sequence.
    def each
      return enum_for(:each) unless block_given?
      while !eof()
        yield nextEntity()
      end
    end

  end # end of class NQXML::Tokenizer
end # end of module NQXML