"""Pull-style DOM construction on top of SAX.

PullDOM is a SAX content handler that turns parser callbacks into a linked
list of (event, node) pairs.  DOMEventStream drives a parser and hands those
pairs out one at a time.  SAX2DOM is a PullDOM variant that also attaches the
nodes it creates to their parent node.
"""

import xml.sax
import xml.sax.handler
import types

# Types that parse() treats as a file name rather than as a stream.
# types.StringType / types.UnicodeType only exist on older interpreters, so
# each one is looked up defensively instead of assuming both (or one) exist.
_StringTypes = [getattr(types, "StringType", str)]
if hasattr(types, "UnicodeType"):
    _StringTypes.append(types.UnicodeType)

# Event tokens: the first item of every (event, node) pair.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"


class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that queues DOM nodes as (event, node) pairs.

    Events are kept in a singly linked list of two-item cells
    ``[event, next_cell]``.  ``firstEvent`` is a dummy head cell and
    ``lastEvent`` is the current tail.  The document itself is created
    lazily by buildDocument() when the first element is seen; comments and
    processing instructions reported before that are parked in
    ``pending_events``.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up empty event list, element stack and namespace contexts.

        documentFactory -- object providing createDocument(uri, tagname,
                           doctype); when None, startDocument() installs
                           the minidom implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        # Stack of uri -> prefix dicts; the last entry is the active one.
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack.

        Fallback used only when __init__ could not bind list.pop directly.
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator handed over by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the next element.

        The declaration is stored in ``_xmlns_attrs`` so startElementNS()
        can turn it into an xmlns attribute, and the uri -> prefix mapping
        is added to the current namespace context (the previous context is
        saved so endPrefixMapping() can restore it).
        """
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        # A missing prefix means the default namespace: attribute "xmlns".
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element node (namespace mode) and queue START_ELEMENT.

        name    -- (uri, localname) pair
        tagName -- qualified name, or None if the reader did not supply it
        attrs   -- attribute collection keyed by (uri, localname) pairs

        The first element seen triggers buildDocument().  The new node is
        pushed on the element stack.
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
        self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                # Namespace declaration: "xmlns" or "xmlns:<prefix>".
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Pop the current element and queue END_ELEMENT for it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Create an element node (non-namespace mode), queue START_ELEMENT.

        The first element seen triggers buildDocument().  The new node is
        pushed on the element stack.
        """
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Pop the current element and queue END_ELEMENT for it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Queue a COMMENT event, or park it until the document exists."""
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document yet, so no node can be created: keep the raw
            # text and let buildDocument() convert it later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or park it until the
        document exists."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # Same deferral as comment(): buildDocument() creates the node.
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event holding a new text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Queue a CHARACTERS event holding a new text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Install the minidom implementation if no factory was given."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for root element (uri, tagname).

        Queues START_DOCUMENT, pushes the document on the element stack,
        converts every parked comment / processing instruction into a node
        event queued after START_DOCUMENT, and returns the document's first
        child (the node the factory created for the root element).
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ", e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the element stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None


class ErrorHandler:
    """Error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        # Parenthesised form behaves the same as the print statement for a
        # single value and is also valid where print is a function.
        print(exception)

    def error(self, exception):
        raise exception

    def fatalError(self, exception):
        raise exception


class DOMEventStream:
    """Iterator over the (event, node) pairs produced while parsing.

    With a parser that has a ``feed`` method the stream is read in
    ``bufsize`` chunks on demand; otherwise the whole input is parsed on the
    first request and the queued events are replayed.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # No incremental interface: fall back to parse-everything mode.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a fresh PullDOM as the parser's content handler."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Sequence-protocol iteration: return the next event.

        ``pos`` is ignored; IndexError signals the end of the events.
        """
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next (event, node) pair or raise StopIteration."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    # Same method under the name used by the newer iterator protocol.
    __next__ = next

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events up to the end of ``node``, building its subtree.

        Every node delivered by a non-END_ELEMENT event is appended to the
        innermost open element; the loop returns when an event carrying
        ``node`` itself is seen, or when the events run out.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next (event, node) pair, or None at end of input.

        Feeds the parser ``bufsize`` characters at a time until at least
        one event is queued.
        """
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: make the handler append right after the head.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                # NOTE(review): anything the handler queues during close()
                # (presumably END_DOCUMENT via endDocument) is not returned
                # by this call -- confirm whether that is intended.
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once every queued event has been handed out, so
            that iteration ends the same way as with getEvent().
        """
        cell = self.pulldom.firstEvent[1]
        if cell is None:
            # Queue exhausted; indexing None here used to raise TypeError.
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None


class SAX2DOM(PullDOM):
    """PullDOM variant that also links created nodes into the tree.

    Elements, processing instructions and text nodes are appended to the
    node on top of the element stack as they are created.
    """

    def buildDocument(self, uri, tagname):
        """Build the document and attach parked processing instructions.

        Processing instructions reported before the root element cannot be
        attached when they arrive (there is no document yet), so they are
        inserted in front of the root element here.
        """
        pending = self.pending_events
        root = PullDOM.buildDocument(self, uri, tagname)
        # PullDOM.buildDocument() replaced each parked entry in place with
        # an (event, node) pair, so the saved list now holds real nodes.
        for cell in pending or ():
            token, node = cell[0]
            if token == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def startElementNS(self, name, tagName, attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # Parked by PullDOM; buildDocument() attaches it later.  There
            # is no node in lastEvent and no parent on the stack yet.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)


default_bufsize = (2 ** 14) - 20


def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a file name or an open stream.

    stream_or_string -- file name (a string) or an object with read()
    parser           -- SAX parser to use; xml.sax.make_parser() if omitted
    bufsize          -- read size per chunk; default_bufsize if None
    """
    if bufsize is None:
        bufsize = default_bufsize
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode hands the parser the raw bytes, so the encoding named
        # in the XML declaration is honoured.
        # NOTE: nothing in this module closes the file opened here.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)


def parseString(string, parser=None):
    """Return a DOMEventStream over XML text held in memory.

    The whole string is handed to the parser in a single chunk
    (bufsize is len(string)).
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module exists: use the io module, picking the
            # buffer class that matches the input type.
            import io
            if isinstance(string, bytes):
                StringIO = io.BytesIO
            else:
                StringIO = io.StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)
# Generated by PyXR 0.9.4