######################################################################## # $Header: /var/local/cvsroot/4Suite/Ft/Xml/Lib/XmlPrinter.py,v 1.16 2006/04/12 21:06:10 uogbuji Exp $ """ This module supports document serialization in XML syntax. Copyright 2005 Fourthought, Inc. (USA). Detailed license and copyright information: http://4suite.org/COPYRIGHT Project home, documentation, distributions: http://4suite.org/ """ import cStreamWriter class XmlPrinter: """ An XmlPrinter instance provides functions for serializing an XML or XML-like document to a stream, based on SAX-like event calls initiated by an Ft.Xml.Lib.Print.PrintVisitor instance. The methods in this base class attempt to emit a well-formed parsed general entity conformant to XML 1.0 syntax, with no extra whitespace added for visual formatting. Subclasses may emit documents conformant to other syntax specifications or with additional whitespace for indenting. The degree of well-formedness of the output depends on the data supplied in the event calls; no checks are done for conditions that would result in syntax errors, such as two attributes with the same name, "--" in a comment, etc. However, attribute() will do nothing if the previous event was not startElement(), thus preventing spurious attribute serializations. """ def __init__(self, stream, encoding): """ stream must be a file-like object open for writing binary data. encoding specifies the encoding which is to be used for writing to the stream. """ self.stream = sw = cStreamWriter.StreamWriter(stream, encoding) self.encoding = encoding self.writeAscii = sw.writeAscii self.writeEncode = sw.writeEncode self.writeEscape = sw.writeEscape self.reset() return def reset(self): "Sets the writer state as if it were a brand new instance" self._inElement = False return def startDocument(self, version='1.0', standalone=None): """ Handles a startDocument event. Writes XML declaration or text declaration to the stream. """ self.writeAscii('\n') return def endDocument(self): """ Handles an endDocument event. Writes any necessary final output to the stream. """ if self._inElement: # No element content, use minimized form self.writeAscii('/>') return def doctype(self, name, publicId, systemId): """ Handles a doctype event. Writes a document type declaration to the stream. """ if self._inElement: self.writeAscii('>') self._inElement = False if publicId and systemId: self.writeAscii('\n') elif systemId: self.writeAscii('\n') return def startElement(self, namespaceUri, tagName, namespaces, attributes): """ Handles a startElement event. Writes part of an element's start-tag or empty-element tag to the stream, and closes the start tag of the previous element, if one remained open. Writes the xmlns attributes for the given dictionary of namespaces, and invokes attribute() as neeeded to write the given dictionary of attributes. The namespaceUri argument is ignored in this class. """ if self._inElement: # Close current start tag self.writeAscii('>') else: self._inElement = True self.writeAscii('<') self.writeEncode(tagName, 'start-tag name') # Write the namespaces for prefix, uri in namespaces.items(): if prefix: self.attribute(namespaceUri, tagName, u"xmlns:"+prefix, uri) else: self.attribute(namespaceUri, tagName, u"xmlns", uri) # Now the attributes for name, value in attributes.items(): self.attribute(namespaceUri, tagName, name, value) return def endElement(self, namespaceUri, tagName): """ Handles an endElement event. Writes the closing tag for an element to the stream, or, if the element had no content, finishes writing the empty element tag. The namespaceUri argument is ignored in this class. """ if self._inElement: # No element content, use minimized form self.writeAscii('/>') self._inElement = False else: self.writeAscii('') return # elementUri and elementName are only needed for HTML output def attribute(self, elementUri, elementName, name, value): """ Handles an attribute event. Writes an attribute to the stream as a space followed by the name, '=', and quote-delimited value. It is the caller's responsibility to ensure that this is called in the correct context, if well-formed output is desired. Preference is given to quotes (\") around attribute values, in accordance with the DomWriter interface in DOM Level 3 Load and Save (25 July 2002 WD), although a value that contains quotes but no apostrophes will be delimited by apostrophes (') instead. The elementName arguments are not used by default, but may be used by subclasses. """ self.writeAscii(" ") self.writeEncode(name, 'attribute name') self.writeAscii("=") # Replace characters illegal in attribute values # Wrap the value with appropriate quoting in accordance with # DOM Level 3 Load and Save: # 1. Attributes not containing quotes are serialized in quotes. # 2. Attributes containing quotes but no apostrophes are serialized # in apostrophes. # 3. Attributes containing both forms of quotes are serialized in # quotes, with quotes within the value represented by the # predefined entity ". if (u'"' in value) and (u"'" not in value): # Use apostrophes (#2) entitymap = self.attrEntitiesApos quote = "'" else: # Use quotes (#1 and #3) entitymap = self.attrEntitiesQuot quote = '"' self.writeAscii(quote) self.writeEscape(value, entitymap) self.writeAscii(quote) return def text(self, text, disableEscaping=0): """ Handles a text event. Writes character data to the stream. If the disableEscaping flag is not set, then unencodable characters are replaced with numeric character references; "&" and "<" are escaped as "&" and "<"; and ">" is escaped as ">" if it is preceded by "]]". If the disableEscaping flag is set, then the characters are written to the stream with no escaping of any kind, which will result in an exception if there are unencodable characters. """ if self._inElement: self.writeAscii('>') self._inElement = False if disableEscaping: # Try to write the raw encoded string (may throw exception) self.writeEncode(text, 'text') else: # FIXME: only escape ">" if after "]]" # (may not be worth the trouble) self.writeEscape(text, self.textEntities) return def cdataSection(self, data): """ Handles a cdataSection event. Writes character data to the stream as a CDATA section. """ if self._inElement: self.writeAscii('>') self._inElement = False sections = data.split(u']]>') self.writeAscii('') self.writeEncode(section, 'CDATA section') self.writeAscii(']]>') return def processingInstruction(self, target, data): """ Handles a processingInstruction event. Writes a processing instruction to the stream. """ if self._inElement: self.writeAscii('>') self._inElement = False self.writeAscii('') return def comment(self, data): """ Handles a comment event. Writes a comment to the stream. """ if self._inElement: self.writeAscii('>') self._inElement = False self.writeAscii("") return # Entities as defined by Canonical XML 1.0 (http://www.w3.org/TR/xml-c14n) # For XML 1.1, add u'\u0085': '…' and u'\u2028': '
' to all textEntities = cStreamWriter.EntityMap({'<' : '<', '>' : '>', '&' : '&', '\r' : ' ', }) attrEntitiesQuot = cStreamWriter.EntityMap({'<' : '<', '&' : '&', '\t' : ' ', '\n' : ' ', '\r' : ' ', '"' : '"', }) attrEntitiesApos = cStreamWriter.EntityMap({'<' : '<', '&' : '&', '\t' : ' ', '\n' : ' ', '\r' : ' ', "'" : ''', }) import re from Ft.Xml import SplitQName from Ft.Xml import XMLNS_NAMESPACE class CanonicalXmlPrinter(XmlPrinter): """ CanonicalXmlPrinter emits only c14n XML. http://www.ibm.com/developerworks/xml/library/x-c14n/ http://www.w3.org/TR/xml-c14n Does not yet: * Normalize all attribute values * Specify all default attributes Note: this class is fully compatible with exclusive c14n: http://www.w3.org/TR/xml-exc-c14n/ whether or not the operation is exclusive depends on preprocessing operations appplied by the caller. See Ft.Xml.Lib.Print, for example """ #Use the hex form of character escaping, according to c14n textEntities = cStreamWriter.EntityMap({'<' : '<', '>' : '>', '&' : '&', '\r' : ' ', }) attrEntitiesQuot = cStreamWriter.EntityMap({'<' : '<', '&' : '&', '\t' : ' ', '\n' : ' ', '\r' : ' ', '"' : '"', }) attrEntitiesApos = cStreamWriter.EntityMap({'<' : '<', '&' : '&', '\t' : ' ', '\n' : ' ', '\r' : ' ', "'" : ''', }) attrNormPattern = re.compile(u'( | | )+') def __init__(self, stream, encoding=None): __doc__ = XmlPrinter.__init__.__doc__ __doc__ += """ Warning: the encoding parameter is ignored """ #Encoding hard-wired, as required XmlPrinter.__init__(self, stream, 'UTF-8') self.reset() return def reset(self): "Sets the writer state as if it were a brand new instance" self._ns_stack = [{None: u'', u'xml': XMLNS_NAMESPACE}] XmlPrinter.reset(self) return def startDocument(self, version='1.0', standalone=None): """ No XML declaration is generated """ #Have to keep track of stack of in-scope NSS for c14n, primarily #for attribute ordering (which is really silly of the C14N spec, #if you ask me) self._ns_stack = [{None: u'', u'xml': XMLNS_NAMESPACE}] return def endDocument(self): """ Writes any necessary final output to the stream. """ if self._inElement: # No element content, but never use minimized form in c14n self.writeAscii('') return def doctype(self, name, publicId, systemId): """ Handles a doctype event. No output in c14n. """ return def startElement(self, namespaceUri, tagName, namespaces, attributes): """ Handles a startElement event. Writes part of an element's start-tag or empty-element tag to the stream, and closes the start tag of the previous element, if one remained open. Writes the xmlns attributes for the given dictionary of namespaces, and invokes attribute() as neeeded to write the given dictionary of attributes. The namespaceUri argument is ignored in this class. """ if self._inElement: # Close current start tag self.writeAscii('>') else: self._inElement = True self.writeAscii('<') self.writeEncode(tagName, 'start-tag name') # Write the namespaces, in alphabetical order of prefixes, with # the default coming first (easy since None comes before any actual # Unicode value) prefixes = namespaces.items() prefixes.sort() parent_prefixes = self._ns_stack[-1].items() for prefix, uri in prefixes: #No redundant NSDecls if (prefix, uri) not in parent_prefixes: if prefix: self.attribute(namespaceUri, tagName, u"xmlns:"+prefix, uri) else: self.attribute(namespaceUri, tagName, u"xmlns", uri) self._ns_stack.append(self._ns_stack[-1].copy()) self._ns_stack[-1].update(namespaces) # Now the attributes attrs = attributes.items() attrs = [ (self._ns_stack[-1].get(SplitQName(name)[0], None), name, value) for name, value in attributes.items() ] attrs.sort() for ns, name, value in attrs: self.attribute(namespaceUri, tagName, name, value) return def endElement(self, namespaceUri, tagName): """ Handles an endElement event. Writes the closing tag for an element to the stream. The namespaceUri argument is ignored in this class. """ del self._ns_stack[-1] if self._inElement: self.writeAscii('>') self._inElement = False self.writeAscii('') return # elementUri and elementName are only needed for HTML output, and thus not at all for c14n def attribute(self, elementUri, elementName, name, value): """ Handles an attribute event. Writes an attribute to the stream as a space followed by the name, '=', and quote-delimited value. It is the caller's responsibility to ensure that this is called in the correct context, if well-formed output is desired. The delimiter is always a quote (\"), as required by c14n The elementName arguments are not used by default, but may be used by subclasses. """ self.writeAscii(" ") self.writeEncode(name, 'attribute name') self.writeAscii("=") entitymap = self.attrEntitiesQuot quote = '"' self.writeAscii(quote) #This is the right normalization for type NMTOKEN, ID, etc. #but we can't assume it's not just plain old CDATA #value = self.attrNormPattern.subn(u' ', value.strip())[0] self.writeEscape(value, entitymap) self.writeAscii(quote) return def text(self, text, disableEscaping=0): """ Handles a text event. Writes character data to the stream. All characters should be suitable for encoding (UTF-8 only); "&" and "<" are escaped as "&" and "<"; and ">" is escaped as ">" if it is preceded by "]]". disableEscaping is ignored. """ if self._inElement: self.writeAscii('>') self._inElement = False text.replace(u'\r\n', u'\r') # FIXME: only escape ">" if after "]]" # (may not be worth the trouble) self.writeEscape(text, self.textEntities) return def cdataSection(self, data): """ Handles a cdataSection event. No CDATA sections in c14n, so just commute to the text event """ self.text(data) return def processingInstruction(self, target, data): """ Handles a processingInstruction event. Writes a processing instruction to the stream. """ if self._inElement: self.writeAscii('>') self._inElement = False self.writeAscii('') return def comment(self, data): """ Handles a comment event. Writes a comment to the stream. """ if self._inElement: self.writeAscii('>') self._inElement = False self.writeAscii("") return