"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""

# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)

from xml.dom import xmlbuilder, minidom, Node
from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
from xml.parsers import expat
from xml.dom.minidom import _append_child, _set_attribute_node
from xml.dom.NodeFilter import NodeFilter

from xml.dom.minicompat import *

TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
_typeinfo_map = {
    "CDATA":    minidom.TypeInfo(None, "cdata"),
    "ENUM":     minidom.TypeInfo(None, "enumeration"),
    "ENTITY":   minidom.TypeInfo(None, "entity"),
    "ENTITIES": minidom.TypeInfo(None, "entities"),
    "ID":       minidom.TypeInfo(None, "id"),
    "IDREF":    minidom.TypeInfo(None, "idref"),
    "IDREFS":   minidom.TypeInfo(None, "idrefs"),
    "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
    }


class ElementInfo(object):
    """Per-element-type information collected from DTD declarations.

    Instances are stored in the document's _elem_info mapping by
    element_decl_handler() and attlist_decl_handler().
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        # One list per <!ATTLIST> attribute; see attlist_decl_handler()
        # for the layout (name at index 1, type at index -2).
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        # __slots__ classes have no __dict__, so state is spelled out.
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for the declared attribute 'aname'.

        Returns minidom._no_type if the attribute was not declared.
        """
        for info in self._attr_info:
            if info[1] == aname:
                t = info[-2]
                if t[0] == "(":
                    # Enumerated types are reported by Expat as the
                    # parenthesized list of allowed values.
                    return _typeinfo_map["ENUM"]
                else:
                    return _typeinfo_map[info[-2]]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always reports no type information."""
        return minidom._no_type

    def isElementContent(self):
        """Return true if the content model allows only child elements."""
        if self._model:
            model_type = self._model[0]
            return model_type not in (expat.model.XML_CTYPE_ANY,
                                      expat.model.XML_CTYPE_MIXED)
        else:
            return False

    def isEmpty(self):
        """Return true if the element was declared EMPTY."""
        if self._model:
            return self._model[0] == expat.model.XML_CTYPE_EMPTY
        else:
            return False

    def isId(self, aname):
        """Return true if attribute 'aname' was declared with type ID."""
        for info in self._attr_info:
            if info[1] == aname:
                return info[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))


def _intern(builder, s):
    """Return the shared copy of 's' from the builder's intern table."""
    return builder._intern_setdefault(s, s)


def _parse_ns_name(builder, name):
    """Split an Expat namespace-qualified name (space separated).

    Returns a (uri, localname, prefix, qname) tuple of interned strings;
    prefix is EMPTY_PREFIX when the name carries no prefix part.
    """
    assert ' ' in name
    parts = name.split(' ')
    intern = builder._intern_setdefault
    if len(parts) == 3:
        uri, localname, prefix = parts
        prefix = intern(prefix, prefix)
        qname = "%s:%s" % (prefix, localname)
        qname = intern(qname, qname)
        localname = intern(localname, localname)
    else:
        uri, localname = parts
        prefix = EMPTY_PREFIX
        qname = localname = intern(localname, localname)
    return intern(uri, uri), localname, prefix, qname


class ExpatBuilder:
    """Document builder that uses Expat to build a ParsedXML.DOM document
    instance."""

    def __init__(self, options=None):
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while 1:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # The internal subset is only extracted from the first
                # buffer, and only once the document element was seen.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document."""
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        self.document.childNodes.append(doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            # Undo the attachment and stop collecting entities and
            # notations, since there is no doctype to hold them.
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs inside the subset are not added to the
            # tree; end_doctype_decl_handler() re-enables the handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Restore the handlers disabled for the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are preserved."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                # Continuation of the CDATA section already in progress.
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node instead of adding one.
            node = childNodes[-1]
            value = node.data + data
            d = node.__dict__
            d['data'] = d['nodeValue'] = value
            return
        else:
            node = minidom.Text()
            d = node.__dict__
            d['data'] = d['nodeValue'] = data
            d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data handler used when CDATA sections are not kept."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node instead of adding one.
            node = childNodes[-1]
            d = node.__dict__
            d['data'] = d['nodeValue'] = node.data + data
            return
        node = minidom.Text()
        d = node.__dict__
        d['data'] = d['nodeValue'] = node.data + data
        d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype."""
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Drop the node only when the filter rejects it, matching the
        # entity, comment and PI handlers.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Do not load the external entity; return 1 to the parser."""
        return 1

    def first_element_handler(self, name, attributes):
        """Start-tag handler for the document element only.

        Replaces itself with start_element_handler() for later elements.
        """
        if self._filter is None and not self._elem_info:
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element (and its attributes) and make it current."""
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            # ordered_attributes is set, so 'attributes' is a flat
            # [name, value, name, value, ...] list.
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['value'] = d['nodeValue'] = value
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter about a newly started element."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Strip ignorable whitespace and apply the filter to 'curNode'."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of an element-content node."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify for text nodes which contain only
        # whitespace.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model from an <!ELEMENT> declaration."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            # The entry was created by an earlier <!ATTLIST>.
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record one attribute from an <!ATTLIST> declaration."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration onto the document node."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            if standalone:
                self.document.standalone = True
            else:
                self.document.standalone = False


# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter about a container that has just been opened.

        Returns FILTER_ACCEPT without calling the filter if whatToShow
        does not select the node type.  Raises ParseEscape on
        FILTER_INTERRUPT and ValueError on any other unknown result.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Ask the filter about a completed node.

        Returns FILTER_ACCEPT without calling the filter if whatToShow
        does not select the node type.  FILTER_SKIP is turned into
        FILTER_REJECT after the children were moved to the parent.
        Raises ParseEscape on FILTER_INTERRUPT and ValueError on any
        other unknown result.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # Node type -> corresponding whatToShow bit.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }


class FilterCrutch(object):
    """Base for helpers that take over the element handlers temporarily.

    The builder's current start/end element handlers are saved and
    replaced by this object's own; subclasses restore them.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        self._level = 0
        self._builder = builder
        parser = builder._parser
        self._old_start = parser.StartElementHandler
        self._old_end = parser.EndElementHandler
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler


class Rejecter(FilterCrutch):
    """Discard everything up to the end tag of a rejected element."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        for name in ("ProcessingInstructionHandler",
                     "CommentHandler",
                     "CharacterDataHandler",
                     "StartCdataSectionHandler",
                     "EndCdataSectionHandler",
                     "ExternalEntityRefHandler",
                     ):
            setattr(parser, name, None)

    def start_element_handler(self, *args):
        self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # restore the old handlers
            parser = self._builder._parser
            self._builder.install(parser)
            parser.StartElementHandler = self._old_start
            parser.EndElementHandler = self._old_end
        else:
            self._level = self._level - 1


class Skipper(FilterCrutch):
    """Build the content of a skipped element directly into its parent."""

    __slots__ = ()

    def start_element_handler(self, *args):
        node = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended into
        # it (it may itself have been rejected or skipped).
        if self._builder.curNode is not node:
            self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # We're popping back out of the node we're skipping, so we
            # shouldn't need to do anything but reset the handlers.
            self._builder._parser.StartElementHandler = self._old_start
            self._builder._parser.EndElementHandler = self._old_end
            self._builder = None
        else:
            self._level = self._level - 1
            self._old_end(*args)


# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)


class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
            self.context = context
        else:
            self.originalDocument = context.ownerDocument
            self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, 1)
        except:
            # Deliberately catches everything: the builder state is
            # reset and the exception is re-raised unchanged.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
##         self._parser = None
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument so that
        # a Document context (which __init__ stores as originalDocument)
        # is handled as well.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n  "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"\n             "%s">' \
                        % (s, notation.publicId, notation.systemId)
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n  "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n             "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    s = "%s NOTATION %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        """Return namespace attributes for the wrapper element (none)."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper entity is referenced."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, 1)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)


class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        parser = expat.ParserCreate(namespace_separator=" ")
        parser.namespace_prefixes = True
        return parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if self._options.namespace_declarations:
            parser.StartNamespaceDeclHandler = (
                self.start_namespace_decl_handler)

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware Element and make it current."""
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        if self._ns_ordered_prefixes:
            # Turn the declarations collected since the previous start
            # tag into xmlns attributes on this element.
            for prefix, uri in self._ns_ordered_prefixes:
                if prefix:
                    a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
                                     XMLNS_NAMESPACE, prefix, "xmlns")
                else:
                    a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                     "xmlns", EMPTY_PREFIX)
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = uri
                d = a.__dict__
                d['value'] = d['nodeValue'] = uri
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)
            del self._ns_ordered_prefixes[:]

        if attributes:
            _attrs = node._attrs
            _attrsNS = node._attrsNS
            # 'attributes' is a flat [name, value, name, value, ...] list.
            for i in range(0, len(attributes), 2):
                aname = attributes[i]
                value = attributes[i+1]
                if ' ' in aname:
                    uri, localname, prefix, qname = _parse_ns_name(self, aname)
                    a = minidom.Attr(qname, uri, localname, prefix)
                    _attrs[qname] = a
                    _attrsNS[(uri, localname)] = a
                else:
                    a = minidom.Attr(aname, EMPTY_NAMESPACE,
                                     aname, EMPTY_PREFIX)
                    _attrs[aname] = a
                    _attrsNS[(EMPTY_NAMESPACE, aname)] = a
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['ownerDocument'] = self.document
                d['value'] = d['nodeValue'] = value
                d['ownerElement'] = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            curNode = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (curNode.namespaceURI == uri
                        and curNode.localName == localname
                        and curNode.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert curNode.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert curNode.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = curNode.parentNode
            self._finish_end_element(curNode)


class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        ExpatBuilder.reset(self)
        self._initNamespaces()


class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        attrs = ""
        context = self.context
        L = []
        while context:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    # add every new NS decl from context to L and attrs string
                    if prefix in L:
                        continue
                    L.append(prefix)
                    if prefix:
                        declname = "xmlns:" + prefix
                    else:
                        declname = "xmlns"
                    if attrs:
                        attrs = "%s\n    %s='%s'" % (attrs, declname, uri)
                    else:
                        attrs = " %s='%s'" % (declname, uri)
            context = context.parentNode
        return attrs


class ParseEscape(Exception):
    """Exception raised to short-circuit parsing.

    Raised by InternalSubsetExtractor once it has what it needs, and by
    FilterVisibilityController when a filter returns FILTER_INTERRUPT.
    """
    pass


class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        # Only the doctype and the first start tag are of interest.
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        # NOTE(review): ExpatBuilder.start_doctype_decl_handler() names
        # these parameters (doctypeName, systemId, publicId, ...); only
        # has_internal_subset is used here, so the order does not matter.
        if has_internal_subset:
            parser = self.getParser()
            self.subset = []
            # Collect the raw text of the subset piece by piece.
            parser.DefaultHandler = self.subset.append
            parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
        else:
            raise ParseEscape()

    def end_doctype_decl_handler(self):
        # Join the pieces and normalize line endings to "\n".
        s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
        self.subset = s
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching the document element means there was no doctype.
        raise ParseEscape()


def parse(file, namespaces=1):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()

    if isinstance(file, StringTypes):
        fp = open(file, 'rb')
        try:
            result = builder.parseFile(fp)
        finally:
            fp.close()
    else:
        result = builder.parseFile(file)
    return result


def parseString(string, namespaces=1):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()
    return builder.parseString(string)


def parseFragment(file, context, namespaces=1):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    if isinstance(file, StringTypes):
        fp = open(file, 'rb')
        try:
            result = builder.parseFile(fp)
        finally:
            fp.close()
    else:
        result = builder.parseFile(file)
    return result


def parseFragmentString(string, context, namespaces=1):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)
    return builder.parseString(string)


def makeBuilder(options):
    """Create a builder based on an Options object."""
    if options.namespaces:
        return ExpatBuilderNS(options)
    else:
        return ExpatBuilder(options)
# Generated by PyXR 0.9.4