########################################################################
# $Header: /var/local/cvsroot/4Suite/Ft/Xml/Lib/TreeCompare.py,v 1.26 2005/03/18 23:47:16 jkloth Exp $
"""
Comparison functions for XML and HTML documents
(mainly used in the test suites)

Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""

import re, sgmllib
from sgmllib import SGMLParser
from xml.dom import Node

# adds colon to regex patterns for names
sgmllib.tagfind = re.compile('[a-zA-Z][-:_.a-zA-Z0-9]*')
sgmllib.attrfind = re.compile(r'\s*([a-zA-Z_][-:_.a-zA-Z0-9]*)(\s*=\s*'
                              r'(\'[^\']*\'|"[^"]*"|\S*))?')

from Ft.Lib.Uri import BASIC_RESOLVER, OsPathToUri
from Ft.Xml import Domlette, InputSource, XMLNS_NAMESPACE
from Ft.Xml.Lib.XmlString import XmlStrStrip, IsXmlSpace

import HtmlPrinter
FORBIDDEN_END_ELEMENTS = HtmlPrinter.HtmlPrinter.forbiddenEndElements.keys()
del HtmlPrinter

# Building blocks for recognising an XML declaration.
_S = "[\x20\x09\x0D\x0A]"
_OptionalS = _S + "?"
_VersionNum = "[a-zA-Z0-9_.:-]+"
_Eq = "%s?=%s?" % (_S, _S)
_VersionInfo = _S + "version" + _Eq + \
               "(?:(?:'" + _VersionNum + "')|" + '(?:"' + _VersionNum + '"))'
_EncName = "[A-Za-z][A-Za-z0-9._-]*"
_EncodingDecl = _S + "encoding" + _Eq + \
                "(?:(?:'" + _EncName + "')|" + '(?:"' + _EncName + '"))'
_SDDecl = _S + "standalone" + _Eq + \
          "(?:(?:'(?:yes|no)')|" + '(?:"(?:yes|no)"))'

# NOTE(review): the group names had been stripped from this pattern, leaving
# "(?P%s)", which is not valid regex syntax.  The names below are inferred
# from the variables they wrap -- confirm against version control.
g_xmlTest = re.compile(r"<\?xml" +
                       r"(?P<VersionInfo>%s)" % _VersionInfo +
                       r"(?P<EncodingDecl>%s)?" % _EncodingDecl +
                       r"(?P<SDDecl>%s)?" % _SDDecl +
                       r"%s?\?>" % _S)

# NOTE(review): the text of the next statement, and everything that followed
# it up to CompareHTML, is missing from this copy of the file.  Only this
# residue survives:
#
#     g_doctypeTest = re.compile("(')
#
# CompareHTML below relies on g_xmlEmptyTagPattern and
# SGMLParserEventGenerator, neither of which is defined in the surviving
# text.  Restore the missing section from version control rather than
# guessing at it.


def CompareHTML(html1, html2, ignoreWhitespace=0):
    """
    Compare two HTML strings by parsing each with SGMLParserEventGenerator
    and walking the two event lists in lockstep until a mismatch is found.

    It has the side effect of reporting the first difference to stdout.

    ignoreWhitespace controls whether whitespace differences in text
    events are ignored: when true, both text values are passed through
    XmlStrStrip before being compared; when false, they are compared as-is.

    Returns 1 if no mismatch was found among the paired events and 0
    (after printing a report) at the first mismatch.  Note that this is
    the reverse of cmp(), which returns 0 for equal operands.

    Raises ValueError if the parser produced an event of a kind this
    function does not know how to compare.
    """
    # SGMLParser goes awry when XML-style empty tags are encountered, so
    # rewrite every match of g_xmlEmptyTagPattern as a plain start tag.
    # Obviously this is a very kludgy 'solution'
    html1 = g_xmlEmptyTagPattern.sub(r'<\1>', html1)
    html2 = g_xmlEmptyTagPattern.sub(r'<\1>', html2)

    p1 = SGMLParserEventGenerator()
    p1.feed(html1)
    p1.close()

    p2 = SGMLParserEventGenerator()
    p2.feed(html2)
    p2.close()

    # Names of the start tags seen so far; used only to print context
    # when a mismatch is reported.
    stack = []

    # Not 100% accurate, but close enough: zip() stops at the end of the
    # shorter event list, so surplus trailing events in the longer one
    # are not examined.
    for cur1, cur2 in zip(p1.events, p2.events):
        if cur1[0] != cur2[0]:
            # different events
            return __ReportEventError(cur1, cur2, stack, 'different events')

        event = cur1[0]
        if event == SGMLParserEventGenerator.TEXT_EVENT:
            # Compare the text of each.  The comparison always happens;
            # ignoreWhitespace only decides whether the text is stripped
            # first.  (Previously nothing was compared unless
            # ignoreWhitespace was true.)
            d1 = cur1[1]
            d2 = cur2[1]
            if ignoreWhitespace:
                d1 = XmlStrStrip(d1)
                d2 = XmlStrStrip(d2)
            if d1 != d2:
                return __ReportEventError(cur1, cur2, stack, 'data')
        elif event == SGMLParserEventGenerator.COMMENT_EVENT:
            # Leading/trailing whitespace in comments is never significant
            d1 = cur1[1]
            d2 = cur2[1]
            if d1.strip() != d2.strip():
                return __ReportEventError(cur1, cur2, stack, 'comment data')
        elif event == SGMLParserEventGenerator.START_TAG_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1, cur2, stack, 'start tag name')
            stack.append(cur1[1])  # Save for a nice print out
            att1 = cur1[2]
            att2 = cur2[2]
            if len(att1) != len(att2):
                return __ReportEventError(cur1, cur2, stack,
                                          'number of attributes')
            for name, value in att1.items():
                # -1 stands in for "attribute missing from the second
                # document"; presumably no real attribute value equals it.
                if att2.get(name, -1) != value:
                    return __ReportEventError(cur1, cur2, stack,
                                              'attribute value %s' % name)
        elif event == SGMLParserEventGenerator.END_TAG_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1, cur2, stack, 'end tag name')
            # Discard any still-open tags above the one being closed ...
            while stack and stack[-1] != cur1[1]:
                del stack[-1]
            # ... then the closed tag itself.  The stack is empty here when
            # the end tag had no recorded start tag; popping unconditionally
            # would raise IndexError.
            if stack:
                del stack[-1]
        elif event == SGMLParserEventGenerator.ENTITYREF_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1, cur2, stack, 'entity ref')
        elif event == SGMLParserEventGenerator.CHARREF_EVENT:
            if cur1[1] != cur2[1]:
                return __ReportEventError(cur1, cur2, stack, 'char ref')
        else:
            # The event tuple itself is not a valid exception object, so
            # raise a real exception that names it.
            raise ValueError('unknown event: %r' % (cur1,))
    return 1


def __ReportEventError(event1, event2, stack, attribute):
    """
    Print the open-tag stack followed by the expected and compared event
    payloads (everything after the event type), each labelled with
    `attribute`.  Always returns 0 so callers can return the result
    directly as the "mismatch" value.
    """
    __PrintStack(stack)
    # A single parenthesised argument prints the same text whether print
    # is a statement or a function.
    print('--- Expected ---')
    print('%s %s' % (attribute, repr(event1[1:])))
    print('--- Compared ---')
    print('%s %s' % (attribute, repr(event2[1:])))
    return 0


def __PrintStack(stack):
    """
    Print each name in `stack` on its own line, indenting every line one
    space further than the previous one.
    """
    indent = ''
    for name in stack:
        print("%s%s" % (indent, name))
        indent += ' '


# Retained commented-out code from the DOM-based comparison.  Its original
# indentation is not recoverable from this copy, so the nesting shown here
# is a best-effort reconstruction.

##    if isHtml:
##        # HTML DOM should already capitalize all tagNames
##        if node1.tagName != node2.tagName:
##            return __ReportError(node1, node2, 'tagName')
##        # Elements where whitespace is significant
##        if node1.tagName in ['SCRIPT', 'STYLE', 'PRE', 'TEXTAREA']:
##            ignoreWhitespace = 0

##        if isHtml:
##            # HTML DOMs should force upper case already
##            # FIXME: PyXML 0.7 changed HTML attributes to be NS so they
##            # are no longer forced to uppercase!
##            if attr1.name.upper() != attr2.name.upper():
##                return __ReportError(attr1, attr2, 'name')

##    if ignoreWhitespace or isHtml:
##        if not XmlStrStrip(text1):
##            text1 = None
##        if not XmlStrStrip(text2):
##            text2 = None