#
# Copyright (c) 2001 by Jim Menard
#
# Released under the same license as Ruby. See
# http://www.ruby-lang.org/en/LICENSE.txt.
#

require 'nqxml/parser'
require 'nqxml/document'

module NQXML

  # Parser that consumes every entity produced by the tokenizer and
  # builds a Document: prolog entities, a single tree of Nodes rooted at
  # the top-level tag, and epilogue entities.
  #
  # The whole input is parsed inside the constructor; once +new+ returns,
  # the result is available through #document.
  class TreeParser < Parser

    # The Document built while parsing.
    attr_reader :document

    # Parses +stringOrReadable+ (passed unchanged to Parser#initialize)
    # and builds the document tree.
    #
    # NOTE(review): @tokenizer is not assigned in this class; presumably
    # it is created by Parser#initialize -- confirm.
    #
    # Raises ParserError if the input is not well formed according to
    # the checks in this class, including a start tag that is still
    # open when the input ends.
    def initialize(stringOrReadable)
      super(stringOrReadable)
      @document = Document.new()

      # Start parsing.
      @nodeStack = []
      @documentSection = :DOCUMENT_PROLOG
      @tokenizer.each { | entity | handleNextEntity(entity) }

      # Anything left on the stack is a start tag that was never
      # closed. Report the innermost (most recently opened) one.
      unless @nodeStack.empty?
        node = @nodeStack.last
        str = "open tag #{node.entity.name} is missing end tag"
        raise ParserError.new(str, @tokenizer)
      end
    end

    # Returns true if the specified entity is a comment, processing
    # instruction, XMLDecl, or text consisting only of whitespace
    # (including empty text); returns false otherwise.
    def miscEntity?(entity)
      return true if entity.instance_of?(Comment) ||
                     entity.instance_of?(ProcessingInstruction) ||
                     entity.instance_of?(XMLDecl)

      # \A and \z anchor the whole string. Ruby's ^ and $ anchor at
      # every line, so they would also accept non-blank text that
      # merely contains an empty line.
      entity.instance_of?(Text) && !(entity.text() =~ /\A\s*\z/).nil?
    end

    # Creates a Node for the start tag +entity+, attaches it to the
    # tree, and pushes it on the stack of open nodes.
    #
    # Raises ParserError if no tag is open but the document already has
    # a root node.
    def handleTagStart(entity)
      parent = @nodeStack.last
      node = Node.new(entity, parent)

      if parent.nil?
        # If parent is nil, we are at top level.
        if !@document.rootNode.nil?
          str = "tag '#{entity.name}' seen after close of" +
            ' top-level tag; there can only be one top-level tag'
          raise ParserError.new(str, @tokenizer)
        end
        @document.setRootNode(node)
      else
        parent.children << node
      end

      @nodeStack.push(node)
    end

    # Pops the innermost open node in response to the end tag +entity+.
    #
    # Raises ParserError if no tag is open, or if the end tag's name
    # differs from the name of the innermost open tag.
    def handleTagEnd(entity)
      # Make sure stack isn't empty
      if @nodeStack.empty?
        str = "end tag '#{entity.name}' without opening tag"
        raise ParserError.new(str, @tokenizer)
      end

      # Make sure this tag name matches the tag about to be popped
      lastSeen = @nodeStack.last
      if entity.name != lastSeen.entity.name
        str = "end tag '#{entity.name}' does not match" +
          " last-seen start tag named '#{lastSeen.entity.name}' "
        raise ParserError.new(str, @tokenizer)
      end

      @nodeStack.pop()

      # If this is the close of the root node, we are now in the
      # document's epilogue where only misc tags are allowed.
      @documentSection = :DOCUMENT_EPILOGUE if @nodeStack.empty?
    end

    # Passes entity on to either handleTagEnd or handleTagStart,
    # depending on entity.isTagEnd.
    def handleTag(entity)
      if entity.isTagEnd
        handleTagEnd(entity)
      else
        handleTagStart(entity)
      end
    end

    # Handle entity based on which document section we are in and what
    # kind of entity it is.
    #
    # * Prolog: misc entities and a single DOCTYPE are added to the
    #   document's prolog. Any other entity switches the parser to the
    #   body section and is then processed as a body entity.
    # * Epilogue: only misc entities are accepted; they are added to
    #   the document's epilogue.
    # * Body: tags go to handleTag; every other entity is added as a
    #   child of the innermost open node.
    #
    # Raises ParserError on a second DOCTYPE, a DOCTYPE outside the
    # prolog, a non-misc entity in the epilogue, or a non-tag entity
    # seen while no tag is open.
    def handleNextEntity(entity)
      if @documentSection == :DOCUMENT_PROLOG
        isDocType = entity.instance_of?(Doctype)
        if !miscEntity?(entity) && !isDocType
          @documentSection = :DOCUMENT_BODY
          # ...continue processing this as a body tag
        else
          if isDocType && !@document.doctype.nil?
            raise ParserError.new("multiple DOCTYPE tags seen",
                                  @tokenizer)
          end
          @document.addToProlog(entity)
          return
        end
      end

      if @documentSection == :DOCUMENT_EPILOGUE
        if !miscEntity?(entity)
          str = "entity of type #{entity.class} seen after" +
            " document's root node"
          raise ParserError.new(str, @tokenizer)
        end
        @document.addToEpilogue(entity)
        return
      end

      # We are in the body of the document.
      if entity.instance_of?(Tag)
        handleTag(entity)
        return
      end

      # From here down, we have any entity except a Tag.

      # A DOCTYPE that reaches this point was found after the prolog
      # ended, which is not allowed.
      if entity.instance_of?(Doctype)
        str = 'DOCTYPE seen outside of document prolog'
        raise ParserError.new(str, @tokenizer)
      end

      # Add this entity to parent. If parent is nil, then we have a
      # problem: the entity isn't a tag, therefore it can't be the
      # root node.
      parent = @nodeStack.last
      if parent.nil?
        str = "unexpected entity of type '#{entity.class}' seen" +
          " outside of root node"
        raise ParserError.new(str, @tokenizer)
      end
      parent.addChild(entity)
    end
  end
end