########################################################################
# $Header: /var/local/cvsroot/4Suite/Ft/Xml/InputSource.py,v 1.49 2005/09/15 00:06:15 jkloth Exp $
"""
Classes providing a standard interface and encapsulation of metadata for
document/entity streams intended for input to various XML processors.

Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""

import os, cStringIO, types, warnings, mimetools

from Ft import FtWarning
from Ft.Lib import Uri, Uuid

__all__ = ['InputSource', 'NullInputSource',
           'InputSourceFactory',
           'DefaultFactory', 'NoCatalogFactory']

# Methods an InputSource instance should expose from its underlying stream.
_file_methods = ('read', 'readline', 'readlines', 'close')


class InputSource:
    """
    An input source is an encapsulation of a source of content.
    It includes a stream (Python file-like object) from which the
    content can be read, a URI to identify the stream and facilitate
    resolution of relative URI references / system IDs encountered
    within the stream, and parameters used by the processors of the
    stream (XML parsers, XSLT processors).

    It is designed to be overridden as applications need different
    functionality from sources.
    """

    # Hints passed along to _openStream()/clone() describing why a
    # resolution is being performed.
    RESOLVE_ENTITY_HINT = "EXTERNAL ENTITY"
    RESOLVE_URI_HINT = "RESOLVE URI"
    CATALOG_URI_HINT = "CATALOG URI"

    def __init__(self, stream, uri=None, processIncludes=True,
                 stripElements=None, factory=None,
                 resolver=Uri.BASIC_RESOLVER, catalog=None, encoding=None):
        """
        InputSource constructor

        source = InputSource(...)

        stream - the stream associated with this input source
        uri - the absolute URI of the input source; when empty or None,
              a 'urn:uuid:' URI is generated
        processIncludes - Whether or not XIncludes should be expanded
        stripElements - Space stripping rules
        factory - The factory that created this instance
        resolver - URI resolver; defaults to Ft.Lib.Uri.BASIC_RESOLVER
        catalog - TR9401/XML Catalog object for resolving public IDs
        encoding - a string externally declaring the stream's encoding;
                   only used when no encoding can be determined from the
                   stream's own metadata
        """
        if uri:
            self.uri = uri
        else:
            self.uri = 'urn:uuid:' + Uuid.UuidAsString(Uuid.GenerateUuid())
        self.stream = stream
        self.processIncludes = processIncludes
        self.stripElements = stripElements or []
        self.factory = factory
        self.fragment = Uri.SplitFragment(self.uri)[1]
        self._resolver = resolver
        self._catalog = catalog

        # Encoding declared by the stream's metadata takes precedence
        # over the externally supplied one.
        enc = self._getStreamEncoding(stream)
        if enc is None:
            enc = encoding
        self.encoding = enc

        self.name = self.uri

        # Expose the stream's file-like methods directly on this
        # instance. Note that these end up in self.__dict__ (see
        # __getstate__).
        for name in _file_methods:
            method = getattr(stream, name, None)
            if method:
                setattr(self, name, method)
        return

    def _getStreamEncoding(self, stream):
        """
        Returns the encoding of the given stream, if this info can be
        determined from metadata in the stream object with a reasonable
        degree of confidence. Returns None otherwise.

        Adheres to RFC 3023, which requires the charset value in the
        Content-Type header to take precedence, or if no value is
        available, to assume us-ascii in the case of certain text/*
        media types. For other text/* media types, adheres to RFC 2616
        sec. 3.7.1, which requires the assumption of iso-8859-1, when
        the entity was transmitted by HTTP. Media type and charset info
        is ignored for streams believed to originate from a local file,
        in accordance with XML 1.0 Third Edition appendix F.2.
        """
        # We should never try to deduce the encoding when the stream is
        # a local file, in order to conform with XML 1.0 Third Edition
        # appendix F.2, and also because urllib.urlopen() uses
        # mimetypes.guess_type() to set the media type on both local
        # files and FTP resources, thus causing '*.xml' files to tend to
        # get a 'text/xml' mapping, which is bad because RFC 3023
        # requires them to be assumed to be us-ascii. Therefore, we must
        # look for clues that assure us that the stream is not likely to
        # be wrapping a file or FTP resource. The way to tell is to look
        # for the 'url' attribute on the stream object. urllib.urlopen()
        # MAY create this attribute and set it to the URL that was
        # passed in. Note that this 'URL' may have just been a local
        # filesystem path or partial URL or junk like 'C:/x/y/z'
        stream_url = getattr(stream, 'url', None)
        if stream_url is None:
            return None
        scheme = Uri.GetScheme(stream_url)
        # A one-character "scheme" is taken to be junk such as a drive
        # letter (see the 'C:/x/y/z' case above).
        if scheme is None or scheme.lower() in ('file', 'ftp') \
           or len(scheme) == 1:
            return None

        # Get the stream metadata.
        # Streams created by urllib.urlopen() MAY have an info() method
        # that MAY return a mimetools.Message object. We can trust this
        # as a source of metadata since we have already ruled out the
        # likelihood of it being a local file or FTP resource.
        #
        # Inspect the stream that was passed in, not self.stream, so the
        # method honours its own argument.
        info = None
        if hasattr(stream, 'info'):
            if isinstance(stream.info, types.MethodType):
                info = stream.info()
        if isinstance(info, mimetools.Message):
            # use explicit charset if present and not empty string.
            charset = info.getparam('charset')
            if charset:
                return charset
            # charset empty or not present, so examine media type
            # and protocol.
            maintype = getattr(info, 'maintype', None)
            subtype = getattr(info, 'subtype', None)
            if maintype == 'text':
                # subtype may be None if the attribute is missing; guard
                # the endswith() call so that case cannot raise.
                if subtype == 'xml' or \
                   subtype == 'xml-external-parsed-entity' or \
                   (subtype and subtype.endswith('+xml')):
                    return 'us-ascii'
                # URI schemes are case-insensitive; compare the same
                # way the file/ftp test above does.
                elif scheme.lower() == 'http':
                    return 'iso-8859-1'
        # If we reach this point, the stream metadata was of no use,
        # so we'll let the parser determine the encoding from
        # the entity itself.
        return None

    def resolveEntity(self, publicId, systemId):
        """
        Resolve an external entity to a new InputSource.

        Presented with an optional public identifier and a system
        identifier, this function attempts to locate a mapping in the
        catalog, if one is defined. If no mapping is found, the system
        identifier will be dereferenced as a URL.
        """
        hint = InputSource.RESOLVE_ENTITY_HINT
        if self._catalog:
            new_uri = self._catalog.resolveEntity(publicId, systemId)
            if new_uri is not None:
                systemId = new_uri
                hint = InputSource.CATALOG_URI_HINT
        return self._resolve(systemId, None, hint)

    def resolve(self, uri, base=None, hint=None):
        """
        Resolve a URI reference into a new InputSource.

        This function is used when a URI reference is encountered in the
        original stream and needs to be resolved (e.g. xi:include,
        xsl:include, xsl:import, document(), etc.).

        When a catalog is available, its URI entries are used first. If
        no entry is found, the URI is resolved against the base URI (the
        URI of this input source when base is None) and then opened.

        The hint parameter is used to give a hint as to what the
        resolution will be used for. It is replaced by CATALOG_URI_HINT
        when the catalog supplied a mapping, and defaults to
        RESOLVE_URI_HINT when not given.

        Errors during resolution (such as "file not found") are raised
        to the caller.
        """
        if self._catalog:
            new_uri = self._catalog.resolveURI(uri)
            if new_uri is not None:
                uri = new_uri
                hint = InputSource.CATALOG_URI_HINT
        if hint is None:
            hint = InputSource.RESOLVE_URI_HINT
        return self._resolve(uri, base, hint)

    def getUriResolver(self):
        """
        This method returns the URI resolver that is used by this input
        source to normalize (resolve to absolute form) and resolve
        (dereference) URI references. This is the public method to use
        if just URI resolution is needed.
        """
        return self._resolver

    #Helper Methods
    def _resolve(self, uri, base, hint, ignoreErrors=False):
        """
        Resolves a system identifier (fragmentless URI reference) into a
        new input source.

        The hint parameter is used to give a hint as to what the
        resolution will be used for.

        If the ignoreErrors flag is set, an error during resolution
        (such as "file not found") will not raise an exception; since no
        stream could be obtained, clone() yields a NullInputSource for
        the URI instead.
        """
        uri = self._normalize(uri, base)
        stream = self._openStream(uri, ignoreErrors, hint)
        return self.clone(stream, uri, hint)

    def _normalize(self, uriref, base=None):
        """
        Normalize (resolve to absolute form) a given URI reference,
        using the given base, or the URI of this input source when no
        base is given.

        The default implementation will just use the default URI
        resolver. If your input source is working with non-standard or
        not supported URIs, then you will need to override this or the
        getUriResolver method.
        """
        if base is None:
            base = self.uri
        return self.getUriResolver().normalize(uriref, base)

    def _openStream(self, uri, ignoreErrors=False, hint=None):
        """
        Returns a representation of a resource as a stream by
        resolving the given URI. If ignoreErrors is set, failure to
        obtain the stream will result in None being returned, rather
        than an exception (e.g. "file not found") being raised.

        Default behaviour is to use the resolver associated with this
        InputSource. The hint argument is not used by this default
        implementation.

        If your custom InputSource needs to open URIs that are not
        supported natively by this InputSource (e.g., repository
        objects, or objects from a database), then you should override
        this method and do whatever it takes to resolve the URI into
        a readable stream.
        """
        try:
            return self.getUriResolver().resolve(uri)
        except (KeyboardInterrupt, SystemExit):
            # ignoreErrors is about resolution failures; never swallow
            # an interpreter exit or a user interrupt.
            raise
        except:
            # Deliberately broad: any resolver failure counts as
            # "could not obtain the stream".
            if ignoreErrors:
                return None
            raise

    def clone(self, stream, uri=None, hint=None):
        """
        Clones this input source, creating a new instance with
        the known params.

        stream - the stream for the new instance; if None, a
                 NullInputSource for the URI is returned instead
        uri - the URI for the new instance; defaults to this
              instance's URI
        hint - when not None (i.e., when cloning on behalf of a
               resolution), the encoding of this instance is not
               inherited by the clone

        If your derived InputSource requires additional state
        information then you have to override how it is cloned and
        pickled.
        """
        if uri is None:
            uri = self.uri
        if stream is None:
            return NullInputSource(uri)
        if hint is not None:
            # don't inherit encoding when cloning for self.resolve()
            encoding = None
        else:
            encoding = self.encoding
        return self.__class__(stream, uri,
                              processIncludes=self.processIncludes,
                              stripElements=self.stripElements,
                              factory=self.factory,
                              resolver=self._resolver,
                              catalog=self._catalog,
                              encoding=encoding)

    #Pickle routines. We need to be able to pickle an input source
    #but cannot pickle a stream
    def __getstate__(self):
        """
        Returns the instance state for pickling, with the stream and
        every attribute bound to it removed.
        """
        state = self.__dict__.copy()
        state['stream'] = None
        # __init__ copies the stream's file methods onto the instance;
        # they are bound to the stream and are no more picklable than
        # the stream itself, so they must be dropped as well.
        for name in _file_methods:
            state.pop(name, None)
        return state


class NullInputSource(InputSource):
    """
    An InputSource that simulates an empty stream.
    """
    def __init__(self, uri=None):
        InputSource.__init__(self, cStringIO.StringIO(), uri)


class InputSourceFactory:
    """
    A factory for creating new InputSource instances.
    """

    FACTORY_URI_HINT = 'FACTORY URI'

    def __init__(self, inputSourceClass=None, resolver=Uri.BASIC_RESOLVER,
                 catalog=None):
        """
        inputSourceClass - the class to instantiate; defaults to
                           InputSource
        resolver - URI resolver handed to created instances unless
                   overridden per call
        catalog - catalog handed to created instances unless overridden
                  per call
        """
        self._klass = inputSourceClass or InputSource
        self.resolver = resolver
        self.catalog = catalog
        return

    def fromUri(self, uri, *v_args, **kw_args):
        """
        Creates an InputSource from the stream resulting from the
        resolution of the given URI.

        uri - a URI from which the input will be read. Important: a
              file path is generally not a URI. To be safe, if you wish
              to read from a file, use the following pattern:

              from Ft.Lib import Uri
              uri = Uri.OsPathToUri("/path/to/file.ext")
              OR
              uri = Uri.OsPathToUri("C:\\path\\to\\file.ext")
        """
        hint = InputSourceFactory.FACTORY_URI_HINT
        if self.catalog:
            new_uri = self.catalog.resolveURI(uri)
            if new_uri is not None:
                uri = new_uri
                hint = InputSource.CATALOG_URI_HINT
        # Build a stream-less source first so that its own resolution
        # machinery (resolver, possible subclass overrides) is what
        # opens the URI.
        src = self.fromStream(None, uri, *v_args, **kw_args)
        return src._resolve(uri, None, hint)

    def fromString(self, st, uri=None, *v_args, **kw_args):
        """
        Creates an InputSource from a stream derived from the given
        string. The uri argument is the URI to use for the stream (one
        should always be given, even if it's bogus).

        Raises ValueError if st is not a str instance.
        """
        if not isinstance(st, str):
            if st is None:
                type_name = 'None'
            else:
                type_name = type(st).__name__
            raise ValueError("String must be of type string, not %s" %
                             type_name)
        stream = cStringIO.StringIO(st)
        return self.fromStream(stream, uri, *v_args, **kw_args)

    def fromStream(self, stream, uri=None, *v_args, **kw_args):
        """
        Creates an InputSource from the given stream.
        The uri argument is the URI to use for the stream (one should
        always be given, even if it's bogus); an FtWarning is issued
        when it is missing.

        The created instance always gets this factory as its factory;
        the factory's resolver and catalog are used unless 'resolver'
        or 'catalog' keyword arguments are supplied.
        """
        if not uri:
            warnings.warn("Creation of InputSource without a URI",
                          FtWarning, 2)
        kw_args['factory'] = self
        if 'resolver' not in kw_args:
            kw_args['resolver'] = self.resolver
        if 'catalog' not in kw_args:
            kw_args['catalog'] = self.catalog
        return self._klass(stream, uri, *v_args, **kw_args)


NoCatalogFactory = InputSourceFactory(catalog=None)

# NOTE(review): imported here rather than at the top of the module,
# presumably to avoid a circular import with Ft.Xml.Catalog -- confirm
# before moving it.
from Ft.Xml.Catalog import GetDefaultCatalog
DefaultFactory = InputSourceFactory(catalog=GetDefaultCatalog())