########################################################################
# $Header: /var/local/cvsroot/4Suite/Ft/Xml/Lib/HtmlPrinter.py,v 1.13 2005/02/09 09:12:06 mbrown Exp $
"""
This module supports document serialization in HTML syntax.

Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""

import htmlentitydefs
import re

from Ft.Xml import EMPTY_NAMESPACE

import cStreamWriter
from XmlPrinter import XmlPrinter


class HtmlPrinter(XmlPrinter):
    """
    An HtmlPrinter instance provides functions for serializing an XML
    or XML-like document to a stream, based on SAX-like event calls
    initiated by an instance of Ft.Xml.Lib.Print.PrintVisitor.

    The methods in this subclass of XmlPrinter attempt to emit a
    document conformant to the HTML 4.01 syntax, with no extra
    whitespace added for visual formatting. The degree of correctness
    of the output depends on the data supplied in the event calls; no
    checks are done for conditions that would result in syntax errors,
    such as two attributes with the same name, "--" in a comment, etc.
    """

    def __init__(self, stream, encoding):
        """
        Creates an HtmlPrinter instance.

        stream must be a file-like object open for writing binary
        data. encoding specifies the encoding which is to be used for
        writing to the stream.
        """
        XmlPrinter.__init__(self, stream, encoding)
        # Nesting counter, not a flag: incremented on entering a no-escape
        # element (see startElement) and decremented on leaving it.
        self.disableOutputEscaping = 0
        return

    def startDocument(self, version='4.0', standalone=None):
        """
        Handles a startDocument event.

        Differs from the overridden method in that no XML declaration
        is written.

        version selects which set of entity maps is used ('3.2' or '4.0');
        any other value falls back to '4.0'. standalone is accepted for
        signature compatibility and is not used here.
        """
        # If the version isn't one we know how to handle, fallback to 4.0.
        if version not in self._versionedEntities:
            version = '4.0'

        # Set the entity maps to the particular version of HTML being output.
        # Each entry is [text map, quot-delimited attr map, apos-delimited
        # attr map], in the order they are appended at class-definition time.
        self.textEntities, self.attrEntitiesQuot, self.attrEntitiesApos = \
            self._versionedEntities[version]
        return

    def doctype(self, name, publicId, systemId):
        """
        Handles a doctype event.

        Extends the overridden method by adding support for the case
        when there is a publicId and no systemId, which is allowed in
        HTML but not in XML.
        """
        if publicId and not systemId:
            # NOTE(review): the markup literals in this branch were lost from
            # the source text and are reconstructed from the docstring; the
            # second arguments to writeEncode are descriptive labels following
            # the pattern of the 'attribute name' label used in attribute().
            self.writeAscii('<!DOCTYPE ')
            self.writeEncode(name, 'document type name')
            self.writeAscii(' PUBLIC "')
            self.writeEncode(publicId, 'document type public-id')
            self.writeAscii('">\n')
        else:
            XmlPrinter.doctype(self, name, publicId, systemId)
        return

    def startElement(self, namespaceUri, tagName, namespaces, attributes):
        """
        Handles a startElement event.

        Extends the overridden method by disabling output escaping for
        the content of certain elements (SCRIPT and STYLE).
        """
        # Elements in a namespace are not HTML; serialize them as XML.
        if namespaceUri is not EMPTY_NAMESPACE:
            XmlPrinter.startElement(self, namespaceUri, tagName,
                                    namespaces, attributes)
            return

        if tagName.lower() in self.noEscapeElements:
            self.disableOutputEscaping += 1

        XmlPrinter.startElement(self, namespaceUri, tagName,
                                namespaces, attributes)

        # HTML tags are never in minimized form ('<p/>'), so close the
        # start tag immediately instead of leaving it pending.
        self.writeAscii('>')
        self._inElement = False
        return

    def endElement(self, namespaceUri, tagName):
        """
        Handles an endElement event.

        Differs from the overridden method in that an end tag is not
        generated for certain elements.
        """
        # Elements in a namespace are not HTML; serialize them as XML.
        if namespaceUri is not EMPTY_NAMESPACE:
            XmlPrinter.endElement(self, namespaceUri, tagName)
            return

        element = tagName.lower()
        if element not in self.forbiddenEndElements:
            # NOTE(review): end-tag literals reconstructed; the source text
            # had its markup stripped here.
            self.writeAscii('</')
            self.writeEncode(tagName, 'element name')
            self.writeAscii('>')

        # Restore normal escaping if closing a no-escape element.
        if element in self.noEscapeElements:
            self.disableOutputEscaping -= 1
        return

    def attribute(self, elementUri, elementName, name, value):
        """
        Handles an attribute event.

        Extends the overridden method by writing boolean attributes in
        minimized form, and by %HH-escaping non-ASCII characters in the
        values of URI-typed attributes.
        """
        # Attributes of namespaced elements are serialized as XML.
        if elementUri is not EMPTY_NAMESPACE:
            XmlPrinter.attribute(self, elementUri, elementName, name, value)
            return

        element = elementName.lower()
        attribute = name.lower()
        if element in self.booleanAttributes.get(attribute, []) \
           and attribute == value.lower():
            # A boolean attribute, just write out the name
            self.writeAscii(' ')
            self.writeEncode(name, 'attribute name')
        elif element in self.uriAttributes.get(attribute, []):
            # From HTML 4.0 Section B.2.1
            # We recommend that user agents adopt the following convention for
            # handling non-ASCII characters:
            # 1. Represent each character in UTF-8 (see [RFC2279]) as one or
            #    more bytes.
            # 2. Escape these bytes with the URI escaping mechanism (i.e., by
            #    converting each byte to %HH, where HH is the hexadecimal
            #    notation of the byte value).
            # (Although this recommendation is for HTML user agents
            # that encounter HTML with improperly escaped URI refs,
            # we implement it in order to comply with XSLT's html
            # output method, and because there's no compelling reason
            # not to do it for non-XSLT serializations as well)
            #
            # FIXME:
            # "&" should not be escaped in an attribute value when it
            # is followed by "{" (see Section B.7.1 of HTML 4.0).
            value = unicode(re.sub('[\x80-\xff]',
                                   lambda match: '%%%02X' % ord(match.group()),
                                   value.encode('UTF-8')))
            XmlPrinter.attribute(self, elementUri, elementName, name, value)
        else:
            XmlPrinter.attribute(self, elementUri, elementName, name, value)
        return

    def text(self, data, disableEscaping=0):
        """
        Handles a text event.

        Extends the overridden method by disabling output escaping if
        in the content of certain elements like SCRIPT or STYLE.
        """
        # Close a still-open start tag before emitting character data.
        if self._inElement:
            self.writeAscii('>')
            self._inElement = False

        disableEscaping = disableEscaping or self.disableOutputEscaping
        XmlPrinter.text(self, data, disableEscaping)
        return

    def processingInstruction(self, target, data):
        """
        Handles a processingInstruction event.

        Differs from the overridden method by writing the tag with
        no "?" at the end.
        """
        # Close a still-open start tag before emitting the instruction.
        if self._inElement:
            self.writeAscii('>')
            self._inElement = False

        # NOTE(review): the literals below were lost from the source text and
        # are reconstructed from the docstring ("<?target data>" form).
        self.writeAscii('<?')
        self.writeEncode(target, 'processing-instruction target')
        if data:
            self.writeAscii(' ')
            self.writeEncode(data, 'processing-instruction data')
        self.writeAscii('>')
        return

    # Elements for which end tags must not be emitted
    forbiddenEndElements = {}
    for name in ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
                 'img', 'input', 'isindex', 'link', 'meta', 'param']:
        forbiddenEndElements[name] = True
    del name

    # Elements in which character data is not escaped
    #
    # FIXME: According to HTML 4.01 section B.3.2, "</" and unencodable
    # characters within the content of a SCRIPT or STYLE element must
    # be escaped according to the conventions of the script or style
    # language in use.
    noEscapeElements = {'script' : True,
                        'style' : True,
                        }

    # NOTE(review): the two tables below were missing from the source text
    # and are rebuilt from the HTML 4.01 attribute index; confirm against
    # the upstream file.

    # Boolean attributes that can be minimized, mapped to the elements
    # on which they are defined
    booleanAttributes = {
        'checked' : ['input'],
        'compact' : ['dir', 'dl', 'menu', 'ol', 'ul'],
        'declare' : ['object'],
        'defer' : ['script'],
        'disabled' : ['button', 'input', 'optgroup', 'option', 'select',
                      'textarea'],
        'ismap' : ['img', 'input'],
        'multiple' : ['select'],
        'nohref' : ['area'],
        'noresize' : ['frame'],
        'noshade' : ['hr'],
        'nowrap' : ['td', 'th'],
        'readonly' : ['input', 'textarea'],
        'selected' : ['option'],
        }

    # URI attributes that can have non-ASCII characters escaped, mapped
    # to the elements on which they are defined
    uriAttributes = {
        'action' : ['form'],
        'background' : ['body'],
        'cite' : ['blockquote', 'del', 'ins', 'q'],
        'classid' : ['object'],
        'codebase' : ['applet', 'object'],
        'data' : ['object'],
        'href' : ['a', 'area', 'base', 'link'],
        'longdesc' : ['frame', 'iframe', 'img'],
        'profile' : ['head'],
        'src' : ['frame', 'iframe', 'img', 'input', 'script'],
        'usemap' : ['img', 'input', 'object'],
        }

    # Character entity tables, keyed by the character to be replaced.
    #
    # NOTE(review): the literal tables were missing from the source text.
    # They are derived here from the standard library's HTML 4 entity list:
    # the ISO 8859-1 range goes to the 3.2 table and everything above it to
    # the 4.0 table. The markup-significant characters (" & < >) are left
    # out because each map below decides for itself how to treat them.
    entities_3_2 = {}
    entities_4_0 = {}
    for codepoint, entity_name in htmlentitydefs.codepoint2name.items():
        if codepoint in (0x22, 0x26, 0x3C, 0x3E):
            continue
        if codepoint <= 0xFF:
            entities_3_2[unichr(codepoint)] = '&%s;' % entity_name
        else:
            entities_4_0[unichr(codepoint)] = '&%s;' % entity_name
    del codepoint
    del entity_name

    # Per-version entity maps; each list is filled below, in order, with
    # the text map, the quot-delimited attribute map and the
    # apos-delimited attribute map (the order startDocument unpacks).
    _versionedEntities = {
        '3.2' : [],
        '4.0' : [],
        }

    textEntities = {'<' : '&lt;',
                    '>' : '&gt;',
                    '&' : '&amp;',
                    '\r' : '&#13;',
                    }
    textEntities.update(entities_3_2)
    _versionedEntities['3.2'].append(cStreamWriter.EntityMap(textEntities))
    # The 4.0 map is a superset of the 3.2 map.
    textEntities.update(entities_4_0)
    textEntities = cStreamWriter.EntityMap(textEntities)
    _versionedEntities['4.0'].append(textEntities)

    # For HTML attribute values:
    #   1. do not escape '<' (see XSLT 1.0 section 16.2)
    #   2. only escape '&' if not followed by '{'
    def attr_amp_escape(string, offset):
        # offset is the position of the '&' within string.
        if string.startswith('&{', offset):
            return '&'
        else:
            return '&amp;'

    attrEntitiesQuot = {'&' : attr_amp_escape,
                        '\t' : '&#9;',
                        '\n' : '&#10;',
                        '\r' : '&#13;',
                        '"' : '&quot;',
                        }
    attrEntitiesQuot.update(entities_3_2)
    _versionedEntities['3.2'].append(cStreamWriter.EntityMap(attrEntitiesQuot))
    attrEntitiesQuot.update(entities_4_0)
    attrEntitiesQuot = cStreamWriter.EntityMap(attrEntitiesQuot)
    _versionedEntities['4.0'].append(attrEntitiesQuot)

    attrEntitiesApos = {'&' : attr_amp_escape,
                        '\t' : '&#9;',
                        '\n' : '&#10;',
                        '\r' : '&#13;',
                        "'" : '&#39;',  # no &apos; in HTML
                        }
    attrEntitiesApos.update(entities_3_2)
    _versionedEntities['3.2'].append(cStreamWriter.EntityMap(attrEntitiesApos))
    attrEntitiesApos.update(entities_4_0)
    attrEntitiesApos = cStreamWriter.EntityMap(attrEntitiesApos)
    _versionedEntities['4.0'].append(attrEntitiesApos)

    # Remove the class-body temporaries so they do not become class
    # attributes.
    del entities_3_2
    del entities_4_0
    del attr_amp_escape