PyXR

c:\python24\lib \ xml \ dom \ expatbuilder.py



0001 """Facility to use the Expat parser to load a minidom instance
0002 from a string or file.
0003 
0004 This avoids all the overhead of SAX and pulldom to gain performance.
0005 """
0006 
0007 # Warning!
0008 #
0009 # This module is tightly bound to the implementation details of the
0010 # minidom DOM and can't be used with other DOM implementations.  This
0011 # is due, in part, to a lack of appropriate methods in the DOM (there is
0012 # no way to create Entity and Notation nodes via the DOM Level 2
0013 # interface), and for performance.  The latter is the cause of some fairly
0014 # cryptic code.
0015 #
0016 # Performance hacks:
0017 #
0018 #   -  .character_data_handler() has an extra case in which continuing
0019 #      data is appended to an existing Text node; this can be a
0020 #      speedup since pyexpat can break up character data into multiple
0021 #      callbacks even though we set the buffer_text attribute on the
0022 #      parser.  This also gives us the advantage that we don't need a
0023 #      separate normalization pass.
0024 #
0025 #   -  Determining that a node exists is done using an identity comparison
0026 #      with None rather than a truth test; this avoids searching for and
0027 #      calling any methods on the node object if it exists.  (A rather
0028 #      nice speedup is achieved this way as well!)
0029 
0030 from xml.dom import xmlbuilder, minidom, Node
0031 from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
0032 from xml.parsers import expat
0033 from xml.dom.minidom import _append_child, _set_attribute_node
0034 from xml.dom.NodeFilter import NodeFilter
0035 
0036 from xml.dom.minicompat import *
0037 
# Module-level aliases for the node-type codes referenced by the handlers.
TEXT_NODE, CDATA_SECTION_NODE, DOCUMENT_NODE = (
    Node.TEXT_NODE, Node.CDATA_SECTION_NODE, Node.DOCUMENT_NODE)

# Module-level aliases for the DOMBuilderFilter verdict codes.
FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP, FILTER_INTERRUPT = (
    xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT,
    xmlbuilder.DOMBuilderFilter.FILTER_REJECT,
    xmlbuilder.DOMBuilderFilter.FILTER_SKIP,
    xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT)

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo.  Every TypeInfo is created with a namespace
# of None and the lower-case DOM name paired with the Expat name below.
_typeinfo_map = dict(
    (expat_name, minidom.TypeInfo(None, dom_name))
    for expat_name, dom_name in (
        ("CDATA", "cdata"),
        ("ENUM", "enumeration"),
        ("ENTITY", "entity"),
        ("ENTITIES", "entities"),
        ("ID", "id"),
        ("IDREF", "idref"),
        ("IDREFS", "idrefs"),
        ("NMTOKEN", "nmtoken"),
        ("NMTOKENS", "nmtokens"),
    ))
0061 
class ElementInfo(NewStyle):
    """DTD-derived information about one element type.

    Instances hold the content model handed to the element declaration
    callback and a list of attribute declarations.  Each attribute
    record is a list in which index 1 is the attribute name and index
    -2 is the Expat type string (see the record appended by the
    builder's attlist_decl_handler()).
    """
    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        """Create the info object for element type *tagName*.

        model -- the Expat content model, or None if no element
        declaration has been seen (yet).
        """
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        # __slots__ classes have no __dict__, so pickle the slots
        # explicitly as a tuple.
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for the attribute named *aname*.

        Returns minidom._no_type when the attribute was not declared or
        when its declared type has no entry in _typeinfo_map.
        """
        for info in self._attr_info:
            if info[1] == aname:
                t = info[-2]
                if t[0] == "(":
                    # An enumeration is reported as its "(a|b|c)" list.
                    return _typeinfo_map["ENUM"]
                # Expat reports NOTATION attributes as "NOTATION(a|b)",
                # which is not a key of the map; fall back to "no type"
                # instead of letting a KeyError escape.
                return _typeinfo_map.get(t, minidom._no_type)
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Return minidom._no_type; no per-namespace type data is kept."""
        return minidom._no_type

    def isElementContent(self):
        """Return true if the content model is neither ANY nor MIXED.

        Returns False when no content model is known.
        """
        if self._model:
            ctype = self._model[0]
            return ctype not in (expat.model.XML_CTYPE_ANY,
                                 expat.model.XML_CTYPE_MIXED)
        else:
            return False

    def isEmpty(self):
        """Return true if the element type was declared EMPTY."""
        if self._model:
            return self._model[0] == expat.model.XML_CTYPE_EMPTY
        else:
            return False

    def isId(self, aname):
        """Return true if attribute *aname* was declared with type ID."""
        for info in self._attr_info:
            if info[1] == aname:
                return info[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        """Namespace-aware variant of isId(); the element arguments are
        ignored and the attribute is looked up as an (auri, aname) pair."""
        # not sure this is meaningful
        return self.isId((auri, aname))
0112 
def _intern(builder, s):
    """Return the shared copy of *s* held in the builder's intern table.

    The first string seen with a given value is stored and returned for
    every later equal string.
    """
    lookup = builder._intern_setdefault
    return lookup(s, s)
0115 
def _parse_ns_name(builder, name):
    """Split a space-separated namespace name into its components.

    *name* is either "uri localname" or "uri localname prefix".  Returns
    the tuple (uri, localname, prefix, qname), each string passed through
    the builder's intern table; without a prefix, prefix is EMPTY_PREFIX
    and qname is the local name itself.
    """
    assert ' ' in name
    fields = name.split(' ')
    canon = builder._intern_setdefault
    if len(fields) != 3:
        # Two fields: no prefix was reported for this name.
        uri, localname = fields
        localname = canon(localname, localname)
        return canon(uri, uri), localname, EMPTY_PREFIX, localname
    uri, localname, prefix = fields
    prefix = canon(prefix, prefix)
    qname = canon("%s:%s" % (prefix, localname),
                  "%s:%s" % (prefix, localname))
    localname = canon(localname, localname)
    return canon(uri, uri), localname, prefix, qname
0131 
0132 
class ExpatBuilder:
    """Document builder that uses Expat to build a ParsedXML.DOM document
    instance.

    The builder installs its bound methods as Expat callbacks and grows
    a minidom tree under self.document while the parser runs;
    self.curNode is the node that new children are appended to.
    """

    def __init__(self, options=None):
        """Create a builder.

        options -- an xmlbuilder.Options instance; a default one is
        created when omitted.  If its ``filter`` attribute is set, the
        filter is wrapped in a FilterVisibilityController.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            # Share the parser's intern table so names created by the
            # builder reuse the parser's string objects.
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            # Attributes arrive as a flat [name, value, ...] list; the
            # start-element handlers index it in steps of two.
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node.

        The file is read in 16 KiB chunks.  A ParseEscape raised by a
        handler ends parsing early without being propagated.
        """
        parser = self.getParser()
        first_buffer = True
        try:
            while 1:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # The internal subset is only looked for in the first
                # chunk, and only once the root element has been seen.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node.

        A ParseEscape raised by a handler ends parsing early without
        being propagated.
        """
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            # The subset text is recovered by a second pass over the
            # source with InternalSubsetExtractor.
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the node it is detached again and entity
        and notation declarations are no longer reported.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        self.document.childNodes.append(doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs inside the subset are not added to the
            # tree; end_doctype_decl_handler() re-installs the handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the comment and PI handlers after the doctype."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing for _finish_end_element() to do; swap in a cheap
            # one-argument callable instead.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are preserved.

        Inside a CDATA section the data goes into a CDATASection node
        (continuing the current one where possible); elsewhere it is
        appended to a trailing Text node or put into a new one.
        """
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (  self._cdata_continue
                  and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            value = node.data + data
            # Write through the instance dict directly (one of the
            # performance hacks of this module).
            d = node.__dict__
            d['data'] = d['nodeValue'] = value
            return
        else:
            node = minidom.Text()
            d = node.__dict__
            d['data'] = d['nodeValue'] = data
            d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Append *data* to a trailing Text node, or add a new Text node."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            d = node.__dict__
            d['data'] = d['nodeValue'] = node.data + data
            return
        node = minidom.Text()
        d = node.__dict__
        d['data'] = d['nodeValue'] = node.data + data
        d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype.

        Parameter entities are ignored, as is everything when the
        ``entities`` option is off.  An entity the filter rejects is
        removed again.
        """
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        A notation the filter rejects is removed again.
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Drop the node only when the filter REJECTs it, matching the
        # entity, comment and PI handlers.  (Comparing against
        # FILTER_ACCEPT here discarded exactly the accepted notations.)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        """Note that subsequent character data belongs to a CDATA section."""
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        """Note that the current CDATA section has ended."""
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Skip the external entity: nothing is loaded and 1 is returned."""
        return 1

    def first_element_handler(self, name, attributes):
        """Start-element handler for the document element only.

        Replaces itself with start_element_handler() for all later
        elements.
        """
        if self._filter is None and not self._elem_info:
            # No filter and no DTD information: end-of-element
            # post-processing has nothing to do.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element with its attributes and descend into it.

        attributes -- flat [name, value, name, value, ...] list.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['value'] = d['nodeValue'] = value
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Ask the filter whether to keep the element just started.

        On FILTER_REJECT or FILTER_SKIP a Rejecter or Skipper takes over
        the element handlers and *node* is removed from the tree.
        """
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        """Step back to the parent node and post-process the element."""
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a completed element.

        Strips ignorable whitespace when DTD information is available
        for the element type, then lets the filter reject the element
        (the document element is never offered to the filter).
        """
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children from *node* when *info*
        says it has element content and the
        ``whitespace_in_element_content`` option is off."""
        if (self._options.whitespace_in_element_content
            or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify for text nodes which contain only
        # whitespace.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Store the content model of element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            # The entry was created earlier by attlist_decl_handler().
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Append an attribute declaration to the info for element *elem*.

        The record layout is
        [None, name, None, None, default, 0, type, required].
        """
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration onto the document.

        A negative *standalone* leaves document.standalone untouched.
        """
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        if standalone >= 0:
            if standalone:
                self.document.standalone = True
            else:
                self.document.standalone = False
0457 
0458 
# The verdicts a filter callback may return to FilterVisibilityController;
# any other value makes the controller raise ValueError.
# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
0462 
class FilterVisibilityController(NewStyle):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work.

    The wrapped filter is only consulted for node types selected by its
    whatToShow mask; all other nodes are accepted without asking.
    """

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Return the filter's verdict on descending into *node*.

        Raises ParseEscape if the filter answers FILTER_INTERRUPT and
        ValueError if it answers anything outside
        _ALLOWED_FILTER_RETURNS.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                # Call form of raise: valid on this Python version and
                # on later ones, unlike "raise ValueError, msg".
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Return the filter's verdict on the completed *node*.

        FILTER_SKIP is handled here by re-parenting the node's children
        and is reported to the caller as FILTER_REJECT.  Raises
        ParseEscape on FILTER_INTERRUPT and ValueError on any value
        outside _ALLOWED_FILTER_RETURNS.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                # Iterate over a copy: appendChild() may alter the list.
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # Node type code -> the whatToShow bit that selects that node type.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }
0519 
0520 
class FilterCrutch(NewStyle):
    """Base for helpers that temporarily take over a builder's element
    handlers.

    Construction saves the parser's current start/end element handlers
    and installs this object's start_element_handler() and
    end_element_handler() in their place; subclasses supply both.
    """
    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        # Depth counter maintained by the subclasses' handlers.
        self._level = 0
        self._builder = builder
        expat_parser = builder._parser
        # Remember what was installed so a subclass can put it back.
        self._old_start, self._old_end = (
            expat_parser.StartElementHandler,
            expat_parser.EndElementHandler)
        expat_parser.StartElementHandler = self.start_element_handler
        expat_parser.EndElementHandler = self.end_element_handler
0532 
class Rejecter(FilterCrutch):
    """Handler set that discards an element together with its content.

    While active, the content handlers listed in __init__() are switched
    off and nested elements are merely counted.  When the end tag of the
    rejected element itself arrives, the builder's handlers are
    installed again.
    """
    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        expat_parser = builder._parser
        silenced = (
            "ProcessingInstructionHandler",
            "CommentHandler",
            "CharacterDataHandler",
            "StartCdataSectionHandler",
            "EndCdataSectionHandler",
            "ExternalEntityRefHandler",
        )
        for handler_name in silenced:
            setattr(expat_parser, handler_name, None)

    def start_element_handler(self, *args):
        # One level deeper inside the rejected subtree.
        self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            # Closing a nested element; still inside the subtree.
            self._level -= 1
            return
        # Closing the rejected element itself: restore the old handlers.
        expat_parser = self._builder._parser
        self._builder.install(expat_parser)
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
0560 
class Skipper(FilterCrutch):
    """Handler set that drops one element but keeps its content.

    Nested elements are passed on to the handlers that were installed
    before; only the end tag of the skipped element itself is swallowed.
    """
    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended into
        # it (curNode changed).
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        expat_parser = self._builder._parser
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
        self._builder = None
0580 
0581 
# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

# System identifier of the synthetic external entity declared in the
# template below; FragmentBuilder.external_entity_ref_handler() compares
# incoming system ids against it.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

# The template is %-formatted once, right here, to fill in the system id
# (the single "%s").  That turns each "%%s" into a plain "%s", leaving
# three placeholders for a later format with the doctype identifier, the
# internal subset and the namespace attributes, in that order.
_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
0599 
0600 
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Create a fragment builder for *context*.

        context -- a Document, or a node whose ownerDocument is the
        document the fragment will belong to.
        options -- passed on to ExpatBuilder.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        """Reset the builder state and forget any fragment built so far."""
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node.

        The source is not parsed directly: a wrapper document built from
        _FRAGMENT_BUILDER_TEMPLATE is parsed instead, and the source is
        fed in by external_entity_ref_handler() when the wrapper's
        entity reference is reached.
        """
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, 1)
        except:
            # Clean up on any failure, then let the error propagate.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
        # Unlike ExpatBuilder.parseString(), self._parser is not cleared
        # here.
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: the
        # latter is None when the context node is the Document itself.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n  "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"' % (s, notation.publicId)
                    # A notation may be declared with a public id only;
                    # don't emit a bogus "None" system literal then.
                    if notation.systemId:
                        s = '%s\n             "%s"' % (s, notation.systemId)
                    s = s + ">"
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n  "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n             "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    # The keyword tying an unparsed entity to its
                    # notation is NDATA; "NOTATION" is not well-formed
                    # in an entity declaration.
                    s = "%s NDATA %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        """Return the namespace attributes for the wrapper element; none
        are produced by this class."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper's entity is hit.

        For the synthetic fragment entity, the source is parsed into a
        new DocumentFragment of the original document and -1 is
        returned; every other entity is left to ExpatBuilder.
        """
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, 1)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
0716 
0717 
class Namespaces:
    """Mix-in for builder classes that makes them namespace-aware.

    It supplies a namespace-enabled parser, records the namespace
    declarations reported by the parser, and builds elements and
    attributes that carry a namespace URI, prefix and local name.
    """

    def _initNamespaces(self):
        # Namespace declarations reported since the last element start,
        # kept as (prefix, uri) pairs in the order they arrived.
        # start_element_handler() turns them into xmlns attributes on
        # the next element and then empties the list.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Return a fresh Expat parser set up for namespace processing.

        A single space separates the parts of the names the parser
        reports, and prefix reporting is switched on.
        """
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Attach the handlers to 'parser', including namespace ones.

        The regular ExpatBuilder handlers are always installed; the
        namespace-declaration handler is added only when the builder
        options ask for namespace declarations.
        """
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Remember a namespace declaration for the next element."""
        declaration = (prefix, uri)
        self._ns_ordered_prefixes.append(declaration)

    def start_element_handler(self, name, attributes):
        """Create an Element for a start tag and make it the current node.

        'name' is the element name as delivered by the parser; a name
        containing a space is split with _parse_ns_name().
        'attributes' is a flat sequence of alternating attribute names
        and values.
        """
        if ' ' not in name:
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        else:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        element = minidom.Element(qname, uri, prefix, localname)
        element.ownerDocument = self.document
        _append_child(self.curNode, element)
        self.curNode = element

        # Turn the declarations collected since the previous element
        # into xmlns / xmlns:prefix attributes on this element.
        pending = self._ns_ordered_prefixes
        if pending:
            for ns_prefix, ns_uri in pending:
                if not ns_prefix:
                    decl = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                else:
                    decl = minidom.Attr(_intern(self, 'xmlns:' + ns_prefix),
                                        XMLNS_NAMESPACE, ns_prefix, "xmlns")
                # The values are stored directly in the instance
                # dictionaries of the attribute and of its first child
                # node rather than through attribute assignment.
                text_state = decl.childNodes[0].__dict__
                text_state['data'] = ns_uri
                text_state['nodeValue'] = ns_uri
                attr_state = decl.__dict__
                attr_state['value'] = ns_uri
                attr_state['nodeValue'] = ns_uri
                attr_state['ownerDocument'] = self.document
                _set_attribute_node(element, decl)
            # Empty the list in place so the declarations are not
            # applied again to a later element.
            del pending[:]

        if attributes:
            by_qname = element._attrs
            by_ns = element._attrsNS
            # Names sit at the even positions, values at the odd ones.
            for idx in range(0, len(attributes), 2):
                aname = attributes[idx]
                value = attributes[idx + 1]
                if ' ' in aname:
                    a_uri, a_local, a_prefix, a_qname = _parse_ns_name(
                        self, aname)
                else:
                    a_uri = EMPTY_NAMESPACE
                    a_local = a_qname = aname
                    a_prefix = EMPTY_PREFIX
                attr = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                # Register the attribute under both of the element's
                # lookup tables.
                by_qname[a_qname] = attr
                by_ns[(a_uri, a_local)] = attr
                text_state = attr.childNodes[0].__dict__
                text_state['data'] = value
                text_state['nodeValue'] = value
                attr_state = attr.__dict__
                attr_state['ownerDocument'] = self.document
                attr_state['value'] = value
                attr_state['nodeValue'] = value
                attr_state['ownerElement'] = element

    if __debug__:
        # This variant only adds sanity checks to the ordinary
        # end_element_handler(), so it is defined only when Python runs
        # without -O.  When changing either one, check whether the
        # other needs the same change.
        #
        def end_element_handler(self, name):
            """Check that 'name' closes the current element, then pop it."""
            finished = self.curNode
            if ' ' not in name:
                assert finished.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert finished.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            else:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (finished.namespaceURI == uri
                        and finished.localName == localname
                        and finished.prefix == prefix), \
                        "element stack messed up! (namespace)"
            self.curNode = finished.parentNode
            self._finish_end_element(finished)
0816 
0817 
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Namespace-aware builder for complete documents.

    Combines the plain ExpatBuilder with the Namespaces mix-in.
    """

    def reset(self):
        """Return the builder to its initial state.

        Runs the ExpatBuilder reset and then starts over with an empty
        list of pending namespace declarations.
        """
        ExpatBuilder.reset(self)
        self._initNamespaces()
0824 
0825 
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        """Reset the fragment builder and forget pending namespace
        declarations."""
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors.

        The result is either empty or a run of xmlns declarations, each
        written as name='uri', ready to be pasted into a start tag.
        The declaration nearest to the context node wins when a prefix
        is declared more than once along the ancestor chain.
        """
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        attrs = ""
        context = self.context
        seen = []
        # Identity test instead of a truth test, as everywhere else in
        # this module: it avoids calling methods on the node.
        while context is not None:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    # add every new NS decl from context to seen and
                    # to the attrs string
                    if prefix in seen:
                        continue
                    seen.append(prefix)
                    if prefix:
                        declname = "xmlns:" + prefix
                    else:
                        declname = "xmlns"
                    # The URI ends up inside a single-quoted attribute
                    # value, so escape the characters that would make
                    # the generated markup ill-formed.  '&' must be
                    # replaced first.  (Assumes the stored URIs are
                    # strings.)
                    value = uri.replace("&", "&amp;")
                    value = value.replace("<", "&lt;")
                    value = value.replace("'", "&apos;")
                    if attrs:
                        attrs = "%s\n    %s='%s'" % (attrs, declname, value)
                    else:
                        attrs = " %s='%s'" % (declname, value)
            context = context.parentNode
        return attrs
0861 
0862 
class ParseEscape(Exception):
    """Signal used by InternalSubsetExtractor to abandon a parse early.

    The extractor's handlers raise it once nothing more needs to be
    read; its parseFile() and parseString() methods catch it.
    """
0866 
class InternalSubsetExtractor(ExpatBuilder):
    """Parser front end that only collects the internal DTD subset.

    Parsing is abandoned, by raising ParseEscape, as soon as the subset
    has been read or it is clear that the document has none.
    """

    # Stays None when no internal subset was found; holds the subset
    # text once a document type declaration with one has been read.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        """Scan the file object 'file'; the early exit is not an error."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            # Raised by our own handlers to stop the parse.
            pass

    def parseString(self, string):
        """Scan the text in 'string'; the early exit is not an error."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            # Raised by our own handlers to stop the parse.
            pass

    def install(self, parser):
        """Install only the two handlers this class needs.

        The ExpatBuilder handlers are deliberately not installed.
        """
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        """Begin collecting the subset, or stop if there is none."""
        if not has_internal_subset:
            raise ParseEscape()
        expat_parser = self.getParser()
        pieces = []
        self.subset = pieces
        # Everything the parser passes to the default handler from here
        # on is appended to the list until the declaration ends.
        expat_parser.DefaultHandler = pieces.append
        expat_parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Join the collected pieces into one string and stop parsing."""
        text = ''.join(self.subset)
        # Normalize line endings: CR LF first, then any remaining CR.
        text = text.replace('\r\n', '\n')
        self.subset = text.replace('\r', '\n')
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        """Stop at the first element; no doctype can follow it."""
        raise ParseEscape()
0909 
0910 
def parse(file, namespaces=1):
    """Build a Document from an XML source and return it.

    'file' is either the name of a file, which is opened in binary
    mode and closed again afterwards, or an already open file object,
    which is left open.  A true 'namespaces' selects the
    namespace-aware builder.
    """
    if namespaces:
        builder_class = ExpatBuilderNS
    else:
        builder_class = ExpatBuilder
    builder = builder_class()

    if not isinstance(file, StringTypes):
        return builder.parseFile(file)

    stream = open(file, 'rb')
    try:
        return builder.parseFile(stream)
    finally:
        # Close the file we opened even if parsing fails.
        stream.close()
0930 
0931 
def parseString(string, namespaces=1):
    """Build a Document from the XML text in 'string' and return it.

    A true 'namespaces' selects the namespace-aware builder.
    """
    if not namespaces:
        return ExpatBuilder().parseString(string)
    return ExpatBuilderNS().parseString(string)
0941 
0942 
def parseFragment(file, context, namespaces=1):
    """Parse part of a document and return the fragment that results.

    'context' should be the parent of the node(s) making up the
    fragment, i.e. the place the fragment was originally taken from.

    'file' is either the name of a file, which is opened in binary
    mode and closed again afterwards, or an already open file object.
    A true 'namespaces' selects the namespace-aware builder.
    """
    if namespaces:
        builder_class = FragmentBuilderNS
    else:
        builder_class = FragmentBuilder
    builder = builder_class(context)

    if not isinstance(file, StringTypes):
        return builder.parseFile(file)

    stream = open(file, 'rb')
    try:
        return builder.parseFile(stream)
    finally:
        # Close the file we opened even if parsing fails.
        stream.close()
0964 
0965 
def parseFragmentString(string, context, namespaces=1):
    """Parse part of a document held in 'string' and return the
    fragment that results.

    'context' should be the parent of the node(s) making up the
    fragment, i.e. the place the fragment was originally taken from.
    A true 'namespaces' selects the namespace-aware builder.
    """
    if not namespaces:
        return FragmentBuilder(context).parseString(string)
    return FragmentBuilderNS(context).parseString(string)
0976 
0977 
def makeBuilder(options):
    """Return a new document builder configured from 'options'.

    The namespace-aware builder is chosen when options.namespaces is
    true, the plain one otherwise; 'options' is handed on to the
    builder unchanged.
    """
    if not options.namespaces:
        return ExpatBuilder(options)
    return ExpatBuilderNS(options)
0984 

Generated by PyXR 0.9.4
SourceForge.net Logo