PyXR

c:\python24\lib \ xmllib.py



0001 """A parser for XML, using the derived class as static DTD."""
0002 
0003 # Author: Sjoerd Mullender.
0004 
0005 import re
0006 import string
0007 
import warnings
# Emit the deprecation notice once, at import time.  A stacklevel of 2
# attributes the warning to the module that imports xmllib instead of to
# xmllib itself, so users can see which of their files needs updating.
warnings.warn("The xmllib module is obsolete.  Use xml.sax instead.",
              DeprecationWarning, 2)
del warnings

# version of this parser module
version = '0.3'
0013 
class Error(RuntimeError):
    """Exception raised by this module for conditions the parser cannot handle."""
0016 
# Regular expressions used for parsing

# Building blocks (plain pattern strings) shared by the compiled
# expressions below.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
# Anything other than TAB, CR, LF, the range space..'~' (octal 40-176)
# and the range octal 240-377 is considered illegal.
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# The characters that end a run of plain character data in goahead().
interesting = re.compile('[]&<]')

amp = re.compile('&')
# A reference of any kind.  Group 1 is the entity name, '#' + decimal
# digits or '#x' + hex digits.  The match also consumes the single
# character following the reference (normally ';'), so callers have to
# check that character themselves.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
# Like ref, but entity references only; the name is in group 'name'.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Character references only.  Group 'char' holds the digits (with a
# leading 'x' for hex) *including* the one terminating character.
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
# White space up to the end of the string; with match() this tests
# whether a string consists of white space only.
space = re.compile(_S + '$')
newline = re.compile('\n')
0032 
# One attribute: mandatory leading white space, the name and an optional
# '=' followed by a value that is either a quoted string or a run of
# characters that are tolerated without quotes.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
# End of a start tag; group 'slash' is '/' for an empty-element tag.
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag: name, all attributes (group 'attrs') and the end.
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# Skip to the next '>' that is not inside a quoted string.
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# The two literal patterns are %-format templates: the group name is
# filled in where they are used, which is why '%' is written as '%%'.
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# Start of a DOCTYPE declaration up to (not including) an internal
# subset or the closing '>'; groups: name, pubid, syslit.
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# The <?xml ...?> declaration; the groups version, encoding and
# standalone hold the values with their quotes still attached.
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
# Start of a processing instruction; group 'proc' is the target name.
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table that turns CR, LF and TAB into a space; applied to
# attribute values in parse_attributes().
attrtrans = string.maketrans(' \r\n\t', '    ')
0073 
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
# A string that is exactly one NCName (use with match()).
ncname = re.compile(_NCName + '$')
# A qualified name; group 'prefix' is None when there is no prefix.
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# An attribute name that declares a namespace: plain "xmlns" (group
# 'ncname' is None) or "xmlns:prefix" (group 'ncname' is the prefix).
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
0081 
0082 # XML parser base class -- find tags and call handler functions.
0083 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
0084 # The dtd is defined by deriving a class which defines methods with
0085 # special names to handle tags: start_foo and end_foo to handle <foo>
0086 # and </foo>, respectively.  The data between tags is passed to the
0087 # parser by calling self.handle_data() with some data as argument (the
0088 # data may be split up in arbitrary chunks).
0089 
class XMLParser:
    """Base class for XML parsers; subclasses supply the tag handlers.

    Data is passed in with feed() and the document is finished with
    close().  The parsing options below can be changed per instance by
    passing keyword arguments of the same name (without the leading
    underscores) to the constructor.
    """
    # Both tables are looked up by (namespace-qualified) tag name.
    # reset() compares self.elements with this very dictionary object to
    # find out whether a subclass supplied its own table.
    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    __accept_unquoted_attributes = 0    # no syntax error for unquoted values
    __accept_missing_endtag_name = 0    # no syntax error for "</>"
    __map_case = 0                      # lower-case tag, attribute and entity names
    __accept_utf8 = 0                   # skip the illegal-character checks
    __translate_attribute_references = 1 # expand references in attribute values
0100 
0101     # Interface -- initialize and reset this instance
0102     def __init__(self, **kw):
0103         self.__fixed = 0
0104         if 'accept_unquoted_attributes' in kw:
0105             self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
0106         if 'accept_missing_endtag_name' in kw:
0107             self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
0108         if 'map_case' in kw:
0109             self.__map_case = kw['map_case']
0110         if 'accept_utf8' in kw:
0111             self.__accept_utf8 = kw['accept_utf8']
0112         if 'translate_attribute_references' in kw:
0113             self.__translate_attribute_references = kw['translate_attribute_references']
0114         self.reset()
0115 
0116     def __fixelements(self):
0117         self.__fixed = 1
0118         self.elements = {}
0119         self.__fixdict(self.__dict__)
0120         self.__fixclass(self.__class__)
0121 
0122     def __fixclass(self, kl):
0123         self.__fixdict(kl.__dict__)
0124         for k in kl.__bases__:
0125             self.__fixclass(k)
0126 
0127     def __fixdict(self, dict):
0128         for key in dict.keys():
0129             if key[:6] == 'start_':
0130                 tag = key[6:]
0131                 start, end = self.elements.get(tag, (None, None))
0132                 if start is None:
0133                     self.elements[tag] = getattr(self, key), end
0134             elif key[:4] == 'end_':
0135                 tag = key[4:]
0136                 start, end = self.elements.get(tag, (None, None))
0137                 if end is None:
0138                     self.elements[tag] = start, getattr(self, key)
0139 
0140     # Interface -- reset this instance.  Loses all unprocessed data
0141     def reset(self):
0142         self.rawdata = ''
0143         self.stack = []
0144         self.nomoretags = 0
0145         self.literal = 0
0146         self.lineno = 1
0147         self.__at_start = 1
0148         self.__seen_doctype = None
0149         self.__seen_starttag = 0
0150         self.__use_namespaces = 0
0151         self.__namespaces = {'xml':None}   # xml is implicitly declared
0152         # backward compatibility hack: if elements not overridden,
0153         # fill it in ourselves
0154         if self.elements is XMLParser.elements:
0155             self.__fixelements()
0156 
0157     # For derived classes only -- enter literal mode (CDATA) till EOF
0158     def setnomoretags(self):
0159         self.nomoretags = self.literal = 1
0160 
0161     # For derived classes only -- enter literal mode (CDATA)
0162     def setliteral(self, *args):
0163         self.literal = 1
0164 
0165     # Interface -- feed some data to the parser.  Call this as
0166     # often as you want, with as little or as much text as you
0167     # want (may include '\n').  (This just saves the text, all the
0168     # processing is done by goahead().)
0169     def feed(self, data):
0170         self.rawdata = self.rawdata + data
0171         self.goahead(0)
0172 
0173     # Interface -- handle the remaining data
0174     def close(self):
0175         self.goahead(1)
0176         if self.__fixed:
0177             self.__fixed = 0
0178             # remove self.elements so that we don't leak
0179             del self.elements
0180 
0181     # Interface -- translate references
0182     def translate_references(self, data, all = 1):
0183         if not self.__translate_attribute_references:
0184             return data
0185         i = 0
0186         while 1:
0187             res = amp.search(data, i)
0188             if res is None:
0189                 return data
0190             s = res.start(0)
0191             res = ref.match(data, s)
0192             if res is None:
0193                 self.syntax_error("bogus `&'")
0194                 i = s+1
0195                 continue
0196             i = res.end(0)
0197             str = res.group(1)
0198             rescan = 0
0199             if str[0] == '#':
0200                 if str[1] == 'x':
0201                     str = chr(int(str[2:], 16))
0202                 else:
0203                     str = chr(int(str[1:]))
0204                 if data[i - 1] != ';':
0205                     self.syntax_error("`;' missing after char reference")
0206                     i = i-1
0207             elif all:
0208                 if str in self.entitydefs:
0209                     str = self.entitydefs[str]
0210                     rescan = 1
0211                 elif data[i - 1] != ';':
0212                     self.syntax_error("bogus `&'")
0213                     i = s + 1 # just past the &
0214                     continue
0215                 else:
0216                     self.syntax_error("reference to unknown entity `&%s;'" % str)
0217                     str = '&' + str + ';'
0218             elif data[i - 1] != ';':
0219                 self.syntax_error("bogus `&'")
0220                 i = s + 1 # just past the &
0221                 continue
0222 
0223             # when we get here, str contains the translated text and i points
0224             # to the end of the string that is to be replaced
0225             data = data[:s] + str + data[i:]
0226             if rescan:
0227                 i = s
0228             else:
0229                 i = s + len(str)
0230 
0231     # Interface - return a dictionary of all namespaces currently valid
0232     def getnamespace(self):
0233         nsdict = {}
0234         for t, d, nst in self.stack:
0235             nsdict.update(d)
0236         return nsdict
0237 
0238     # Internal -- handle data as far as reasonable.  May leave state
0239     # and data to be processed by a subsequent call.  If 'end' is
0240     # true, force handling all data as if followed by EOF marker.
0241     def goahead(self, end):
0242         rawdata = self.rawdata
0243         i = 0
0244         n = len(rawdata)
0245         while i < n:
0246             if i > 0:
0247                 self.__at_start = 0
0248             if self.nomoretags:
0249                 data = rawdata[i:n]
0250                 self.handle_data(data)
0251                 self.lineno = self.lineno + data.count('\n')
0252                 i = n
0253                 break
0254             res = interesting.search(rawdata, i)
0255             if res:
0256                 j = res.start(0)
0257             else:
0258                 j = n
0259             if i < j:
0260                 data = rawdata[i:j]
0261                 if self.__at_start and space.match(data) is None:
0262                     self.syntax_error('illegal data at start of file')
0263                 self.__at_start = 0
0264                 if not self.stack and space.match(data) is None:
0265                     self.syntax_error('data not in content')
0266                 if not self.__accept_utf8 and illegal.search(data):
0267                     self.syntax_error('illegal character in content')
0268                 self.handle_data(data)
0269                 self.lineno = self.lineno + data.count('\n')
0270             i = j
0271             if i == n: break
0272             if rawdata[i] == '<':
0273                 if starttagopen.match(rawdata, i):
0274                     if self.literal:
0275                         data = rawdata[i]
0276                         self.handle_data(data)
0277                         self.lineno = self.lineno + data.count('\n')
0278                         i = i+1
0279                         continue
0280                     k = self.parse_starttag(i)
0281                     if k < 0: break
0282                     self.__seen_starttag = 1
0283                     self.lineno = self.lineno + rawdata[i:k].count('\n')
0284                     i = k
0285                     continue
0286                 if endtagopen.match(rawdata, i):
0287                     k = self.parse_endtag(i)
0288                     if k < 0: break
0289                     self.lineno = self.lineno + rawdata[i:k].count('\n')
0290                     i =  k
0291                     continue
0292                 if commentopen.match(rawdata, i):
0293                     if self.literal:
0294                         data = rawdata[i]
0295                         self.handle_data(data)
0296                         self.lineno = self.lineno + data.count('\n')
0297                         i = i+1
0298                         continue
0299                     k = self.parse_comment(i)
0300                     if k < 0: break
0301                     self.lineno = self.lineno + rawdata[i:k].count('\n')
0302                     i = k
0303                     continue
0304                 if cdataopen.match(rawdata, i):
0305                     k = self.parse_cdata(i)
0306                     if k < 0: break
0307                     self.lineno = self.lineno + rawdata[i:k].count('\n')
0308                     i = k
0309                     continue
0310                 res = xmldecl.match(rawdata, i)
0311                 if res:
0312                     if not self.__at_start:
0313                         self.syntax_error("<?xml?> declaration not at start of document")
0314                     version, encoding, standalone = res.group('version',
0315                                                               'encoding',
0316                                                               'standalone')
0317                     if version[1:-1] != '1.0':
0318                         raise Error('only XML version 1.0 supported')
0319                     if encoding: encoding = encoding[1:-1]
0320                     if standalone: standalone = standalone[1:-1]
0321                     self.handle_xml(encoding, standalone)
0322                     i = res.end(0)
0323                     continue
0324                 res = procopen.match(rawdata, i)
0325                 if res:
0326                     k = self.parse_proc(i)
0327                     if k < 0: break
0328                     self.lineno = self.lineno + rawdata[i:k].count('\n')
0329                     i = k
0330                     continue
0331                 res = doctype.match(rawdata, i)
0332                 if res:
0333                     if self.literal:
0334                         data = rawdata[i]
0335                         self.handle_data(data)
0336                         self.lineno = self.lineno + data.count('\n')
0337                         i = i+1
0338                         continue
0339                     if self.__seen_doctype:
0340                         self.syntax_error('multiple DOCTYPE elements')
0341                     if self.__seen_starttag:
0342                         self.syntax_error('DOCTYPE not at beginning of document')
0343                     k = self.parse_doctype(res)
0344                     if k < 0: break
0345                     self.__seen_doctype = res.group('name')
0346                     if self.__map_case:
0347                         self.__seen_doctype = self.__seen_doctype.lower()
0348                     self.lineno = self.lineno + rawdata[i:k].count('\n')
0349                     i = k
0350                     continue
0351             elif rawdata[i] == '&':
0352                 if self.literal:
0353                     data = rawdata[i]
0354                     self.handle_data(data)
0355                     i = i+1
0356                     continue
0357                 res = charref.match(rawdata, i)
0358                 if res is not None:
0359                     i = res.end(0)
0360                     if rawdata[i-1] != ';':
0361                         self.syntax_error("`;' missing in charref")
0362                         i = i-1
0363                     if not self.stack:
0364                         self.syntax_error('data not in content')
0365                     self.handle_charref(res.group('char')[:-1])
0366                     self.lineno = self.lineno + res.group(0).count('\n')
0367                     continue
0368                 res = entityref.match(rawdata, i)
0369                 if res is not None:
0370                     i = res.end(0)
0371                     if rawdata[i-1] != ';':
0372                         self.syntax_error("`;' missing in entityref")
0373                         i = i-1
0374                     name = res.group('name')
0375                     if self.__map_case:
0376                         name = name.lower()
0377                     if name in self.entitydefs:
0378                         self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
0379                         n = len(rawdata)
0380                         i = res.start(0)
0381                     else:
0382                         self.unknown_entityref(name)
0383                     self.lineno = self.lineno + res.group(0).count('\n')
0384                     continue
0385             elif rawdata[i] == ']':
0386                 if self.literal:
0387                     data = rawdata[i]
0388                     self.handle_data(data)
0389                     i = i+1
0390                     continue
0391                 if n-i < 3:
0392                     break
0393                 if cdataclose.match(rawdata, i):
0394                     self.syntax_error("bogus `]]>'")
0395                 self.handle_data(rawdata[i])
0396                 i = i+1
0397                 continue
0398             else:
0399                 raise Error('neither < nor & ??')
0400             # We get here only if incomplete matches but
0401             # nothing else
0402             break
0403         # end while
0404         if i > 0:
0405             self.__at_start = 0
0406         if end and i < n:
0407             data = rawdata[i]
0408             self.syntax_error("bogus `%s'" % data)
0409             if not self.__accept_utf8 and illegal.search(data):
0410                 self.syntax_error('illegal character in content')
0411             self.handle_data(data)
0412             self.lineno = self.lineno + data.count('\n')
0413             self.rawdata = rawdata[i+1:]
0414             return self.goahead(end)
0415         self.rawdata = rawdata[i:]
0416         if end:
0417             if not self.__seen_starttag:
0418                 self.syntax_error('no elements in file')
0419             if self.stack:
0420                 self.syntax_error('missing end tags')
0421                 while self.stack:
0422                     self.finish_endtag(self.stack[-1][0])
0423 
0424     # Internal -- parse comment, return length or -1 if not terminated
0425     def parse_comment(self, i):
0426         rawdata = self.rawdata
0427         if rawdata[i:i+4] != '<!--':
0428             raise Error('unexpected call to handle_comment')
0429         res = commentclose.search(rawdata, i+4)
0430         if res is None:
0431             return -1
0432         if doubledash.search(rawdata, i+4, res.start(0)):
0433             self.syntax_error("`--' inside comment")
0434         if rawdata[res.start(0)-1] == '-':
0435             self.syntax_error('comment cannot end in three dashes')
0436         if not self.__accept_utf8 and \
0437            illegal.search(rawdata, i+4, res.start(0)):
0438             self.syntax_error('illegal character in comment')
0439         self.handle_comment(rawdata[i+4: res.start(0)])
0440         return res.end(0)
0441 
0442     # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
0443     def parse_doctype(self, res):
0444         rawdata = self.rawdata
0445         n = len(rawdata)
0446         name = res.group('name')
0447         if self.__map_case:
0448             name = name.lower()
0449         pubid, syslit = res.group('pubid', 'syslit')
0450         if pubid is not None:
0451             pubid = pubid[1:-1]         # remove quotes
0452             pubid = ' '.join(pubid.split()) # normalize
0453         if syslit is not None: syslit = syslit[1:-1] # remove quotes
0454         j = k = res.end(0)
0455         if k >= n:
0456             return -1
0457         if rawdata[k] == '[':
0458             level = 0
0459             k = k+1
0460             dq = sq = 0
0461             while k < n:
0462                 c = rawdata[k]
0463                 if not sq and c == '"':
0464                     dq = not dq
0465                 elif not dq and c == "'":
0466                     sq = not sq
0467                 elif sq or dq:
0468                     pass
0469                 elif level <= 0 and c == ']':
0470                     res = endbracket.match(rawdata, k+1)
0471                     if res is None:
0472                         return -1
0473                     self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
0474                     return res.end(0)
0475                 elif c == '<':
0476                     level = level + 1
0477                 elif c == '>':
0478                     level = level - 1
0479                     if level < 0:
0480                         self.syntax_error("bogus `>' in DOCTYPE")
0481                 k = k+1
0482         res = endbracketfind.match(rawdata, k)
0483         if res is None:
0484             return -1
0485         if endbracket.match(rawdata, k) is None:
0486             self.syntax_error('garbage in DOCTYPE')
0487         self.handle_doctype(name, pubid, syslit, None)
0488         return res.end(0)
0489 
0490     # Internal -- handle CDATA tag, return length or -1 if not terminated
0491     def parse_cdata(self, i):
0492         rawdata = self.rawdata
0493         if rawdata[i:i+9] != '<![CDATA[':
0494             raise Error('unexpected call to parse_cdata')
0495         res = cdataclose.search(rawdata, i+9)
0496         if res is None:
0497             return -1
0498         if not self.__accept_utf8 and \
0499            illegal.search(rawdata, i+9, res.start(0)):
0500             self.syntax_error('illegal character in CDATA')
0501         if not self.stack:
0502             self.syntax_error('CDATA not in content')
0503         self.handle_cdata(rawdata[i+9:res.start(0)])
0504         return res.end(0)
0505 
    # The attribute names that parse_proc() accepts on an old-fashioned
    # <?xml:namespace ...?> declaration; only the keys are used.
    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
0507     # Internal -- handle a processing instruction tag
0508     def parse_proc(self, i):
0509         rawdata = self.rawdata
0510         end = procclose.search(rawdata, i)
0511         if end is None:
0512             return -1
0513         j = end.start(0)
0514         if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
0515             self.syntax_error('illegal character in processing instruction')
0516         res = tagfind.match(rawdata, i+2)
0517         if res is None:
0518             raise Error('unexpected call to parse_proc')
0519         k = res.end(0)
0520         name = res.group(0)
0521         if self.__map_case:
0522             name = name.lower()
0523         if name == 'xml:namespace':
0524             self.syntax_error('old-fashioned namespace declaration')
0525             self.__use_namespaces = -1
0526             # namespace declaration
0527             # this must come after the <?xml?> declaration (if any)
0528             # and before the <!DOCTYPE> (if any).
0529             if self.__seen_doctype or self.__seen_starttag:
0530                 self.syntax_error('xml:namespace declaration too late in document')
0531             attrdict, namespace, k = self.parse_attributes(name, k, j)
0532             if namespace:
0533                 self.syntax_error('namespace declaration inside namespace declaration')
0534             for attrname in attrdict.keys():
0535                 if not attrname in self.__xml_namespace_attributes:
0536                     self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
0537             if not 'ns' in attrdict or not 'prefix' in attrdict:
0538                 self.syntax_error('xml:namespace without required attributes')
0539             prefix = attrdict.get('prefix')
0540             if ncname.match(prefix) is None:
0541                 self.syntax_error('xml:namespace illegal prefix value')
0542                 return end.end(0)
0543             if prefix in self.__namespaces:
0544                 self.syntax_error('xml:namespace prefix not unique')
0545             self.__namespaces[prefix] = attrdict['ns']
0546         else:
0547             if name.lower() == 'xml':
0548                 self.syntax_error('illegal processing instruction target name')
0549             self.handle_proc(name, rawdata[k:j])
0550         return end.end(0)
0551 
0552     # Internal -- parse attributes between i and j
0553     def parse_attributes(self, tag, i, j):
0554         rawdata = self.rawdata
0555         attrdict = {}
0556         namespace = {}
0557         while i < j:
0558             res = attrfind.match(rawdata, i)
0559             if res is None:
0560                 break
0561             attrname, attrvalue = res.group('name', 'value')
0562             if self.__map_case:
0563                 attrname = attrname.lower()
0564             i = res.end(0)
0565             if attrvalue is None:
0566                 self.syntax_error("no value specified for attribute `%s'" % attrname)
0567                 attrvalue = attrname
0568             elif attrvalue[:1] == "'" == attrvalue[-1:] or \
0569                  attrvalue[:1] == '"' == attrvalue[-1:]:
0570                 attrvalue = attrvalue[1:-1]
0571             elif not self.__accept_unquoted_attributes:
0572                 self.syntax_error("attribute `%s' value not quoted" % attrname)
0573             res = xmlns.match(attrname)
0574             if res is not None:
0575                 # namespace declaration
0576                 ncname = res.group('ncname')
0577                 namespace[ncname or ''] = attrvalue or None
0578                 if not self.__use_namespaces:
0579                     self.__use_namespaces = len(self.stack)+1
0580                 continue
0581             if '<' in attrvalue:
0582                 self.syntax_error("`<' illegal in attribute value")
0583             if attrname in attrdict:
0584                 self.syntax_error("attribute `%s' specified twice" % attrname)
0585             attrvalue = attrvalue.translate(attrtrans)
0586             attrdict[attrname] = self.translate_references(attrvalue)
0587         return attrdict, namespace, i
0588 
0589     # Internal -- handle starttag, return length or -1 if not terminated
0590     def parse_starttag(self, i):
0591         rawdata = self.rawdata
0592         # i points to start of tag
0593         end = endbracketfind.match(rawdata, i+1)
0594         if end is None:
0595             return -1
0596         tag = starttagmatch.match(rawdata, i)
0597         if tag is None or tag.end(0) != end.end(0):
0598             self.syntax_error('garbage in starttag')
0599             return end.end(0)
0600         nstag = tagname = tag.group('tagname')
0601         if self.__map_case:
0602             nstag = tagname = nstag.lower()
0603         if not self.__seen_starttag and self.__seen_doctype and \
0604            tagname != self.__seen_doctype:
0605             self.syntax_error('starttag does not match DOCTYPE')
0606         if self.__seen_starttag and not self.stack:
0607             self.syntax_error('multiple elements on top level')
0608         k, j = tag.span('attrs')
0609         attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
0610         self.stack.append((tagname, nsdict, nstag))
0611         if self.__use_namespaces:
0612             res = qname.match(tagname)
0613         else:
0614             res = None
0615         if res is not None:
0616             prefix, nstag = res.group('prefix', 'local')
0617             if prefix is None:
0618                 prefix = ''
0619             ns = None
0620             for t, d, nst in self.stack:
0621                 if prefix in d:
0622                     ns = d[prefix]
0623             if ns is None and prefix != '':
0624                 ns = self.__namespaces.get(prefix)
0625             if ns is not None:
0626                 nstag = ns + ' ' + nstag
0627             elif prefix != '':
0628                 nstag = prefix + ':' + nstag # undo split
0629             self.stack[-1] = tagname, nsdict, nstag
0630         # translate namespace of attributes
0631         attrnamemap = {} # map from new name to old name (used for error reporting)
0632         for key in attrdict.keys():
0633             attrnamemap[key] = key
0634         if self.__use_namespaces:
0635             nattrdict = {}
0636             for key, val in attrdict.items():
0637                 okey = key
0638                 res = qname.match(key)
0639                 if res is not None:
0640                     aprefix, key = res.group('prefix', 'local')
0641                     if self.__map_case:
0642                         key = key.lower()
0643                     if aprefix is not None:
0644                         ans = None
0645                         for t, d, nst in self.stack:
0646                             if aprefix in d:
0647                                 ans = d[aprefix]
0648                         if ans is None:
0649                             ans = self.__namespaces.get(aprefix)
0650                         if ans is not None:
0651                             key = ans + ' ' + key
0652                         else:
0653                             key = aprefix + ':' + key
0654                 nattrdict[key] = val
0655                 attrnamemap[key] = okey
0656             attrdict = nattrdict
0657         attributes = self.attributes.get(nstag)
0658         if attributes is not None:
0659             for key in attrdict.keys():
0660                 if not key in attributes:
0661                     self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
0662             for key, val in attributes.items():
0663                 if val is not None and not key in attrdict:
0664                     attrdict[key] = val
0665         method = self.elements.get(nstag, (None, None))[0]
0666         self.finish_starttag(nstag, attrdict, method)
0667         if tag.group('slash') == '/':
0668             self.finish_endtag(tagname)
0669         return tag.end(0)
0670 
0671     # Internal -- parse endtag
0672     def parse_endtag(self, i):
0673         rawdata = self.rawdata
0674         end = endbracketfind.match(rawdata, i+1)
0675         if end is None:
0676             return -1
0677         res = tagfind.match(rawdata, i+2)
0678         if res is None:
0679             if self.literal:
0680                 self.handle_data(rawdata[i])
0681                 return i+1
0682             if not self.__accept_missing_endtag_name:
0683                 self.syntax_error('no name specified in end tag')
0684             tag = self.stack[-1][0]
0685             k = i+2
0686         else:
0687             tag = res.group(0)
0688             if self.__map_case:
0689                 tag = tag.lower()
0690             if self.literal:
0691                 if not self.stack or tag != self.stack[-1][0]:
0692                     self.handle_data(rawdata[i])
0693                     return i+1
0694             k = res.end(0)
0695         if endbracket.match(rawdata, k) is None:
0696             self.syntax_error('garbage in end tag')
0697         self.finish_endtag(tag)
0698         return end.end(0)
0699 
0700     # Internal -- finish processing of start tag
0701     def finish_starttag(self, tagname, attrdict, method):
0702         if method is not None:
0703             self.handle_starttag(tagname, method, attrdict)
0704         else:
0705             self.unknown_starttag(tagname, attrdict)
0706 
0707     # Internal -- finish processing of end tag
0708     def finish_endtag(self, tag):
0709         self.literal = 0
0710         if not tag:
0711             self.syntax_error('name-less end tag')
0712             found = len(self.stack) - 1
0713             if found < 0:
0714                 self.unknown_endtag(tag)
0715                 return
0716         else:
0717             found = -1
0718             for i in range(len(self.stack)):
0719                 if tag == self.stack[i][0]:
0720                     found = i
0721             if found == -1:
0722                 self.syntax_error('unopened end tag')
0723                 return
0724         while len(self.stack) > found:
0725             if found < len(self.stack) - 1:
0726                 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
0727             nstag = self.stack[-1][2]
0728             method = self.elements.get(nstag, (None, None))[1]
0729             if method is not None:
0730                 self.handle_endtag(nstag, method)
0731             else:
0732                 self.unknown_endtag(nstag)
0733             if self.__use_namespaces == len(self.stack):
0734                 self.__use_namespaces = 0
0735             del self.stack[-1]
0736 
0737     # Overridable -- handle xml processing instruction
0738     def handle_xml(self, encoding, standalone):
0739         pass
0740 
0741     # Overridable -- handle DOCTYPE
0742     def handle_doctype(self, tag, pubid, syslit, data):
0743         pass
0744 
0745     # Overridable -- handle start tag
0746     def handle_starttag(self, tag, method, attrs):
0747         method(attrs)
0748 
0749     # Overridable -- handle end tag
0750     def handle_endtag(self, tag, method):
0751         method()
0752 
0753     # Example -- handle character reference, no need to override
0754     def handle_charref(self, name):
0755         try:
0756             if name[0] == 'x':
0757                 n = int(name[1:], 16)
0758             else:
0759                 n = int(name)
0760         except ValueError:
0761             self.unknown_charref(name)
0762             return
0763         if not 0 <= n <= 255:
0764             self.unknown_charref(name)
0765             return
0766         self.handle_data(chr(n))
0767 
0768     # Definition of entities -- derived classes may override
    # Predefined entities, keyed by entity name.  Every value in this table
    # is written as a decimal character reference.
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }
0775 
0776     # Example -- handle data, should be overridden
0777     def handle_data(self, data):
0778         pass
0779 
0780     # Example -- handle cdata, could be overridden
0781     def handle_cdata(self, data):
0782         pass
0783 
0784     # Example -- handle comment, could be overridden
0785     def handle_comment(self, data):
0786         pass
0787 
0788     # Example -- handle processing instructions, could be overridden
0789     def handle_proc(self, name, data):
0790         pass
0791 
0792     # Example -- handle relatively harmless syntax errors, could be overridden
0793     def syntax_error(self, message):
0794         raise Error('Syntax error at line %d: %s' % (self.lineno, message))
0795 
0796     # To be overridden -- handlers for unknown objects
0797     def unknown_starttag(self, tag, attrs): pass
0798     def unknown_endtag(self, tag): pass
0799     def unknown_charref(self, ref): pass
0800     def unknown_entityref(self, name):
0801         self.syntax_error("reference to unknown entity `&%s;'" % name)
0802 
0803 
0804 class TestXMLParser(XMLParser):
0805 
0806     def __init__(self, **kw):
0807         self.testdata = ""
0808         XMLParser.__init__(self, **kw)
0809 
0810     def handle_xml(self, encoding, standalone):
0811         self.flush()
0812         print 'xml: encoding =',encoding,'standalone =',standalone
0813 
0814     def handle_doctype(self, tag, pubid, syslit, data):
0815         self.flush()
0816         print 'DOCTYPE:',tag, repr(data)
0817 
0818     def handle_data(self, data):
0819         self.testdata = self.testdata + data
0820         if len(repr(self.testdata)) >= 70:
0821             self.flush()
0822 
0823     def flush(self):
0824         data = self.testdata
0825         if data:
0826             self.testdata = ""
0827             print 'data:', repr(data)
0828 
0829     def handle_cdata(self, data):
0830         self.flush()
0831         print 'cdata:', repr(data)
0832 
0833     def handle_proc(self, name, data):
0834         self.flush()
0835         print 'processing:',name,repr(data)
0836 
0837     def handle_comment(self, data):
0838         self.flush()
0839         r = repr(data)
0840         if len(r) > 68:
0841             r = r[:32] + '...' + r[-32:]
0842         print 'comment:', r
0843 
0844     def syntax_error(self, message):
0845         print 'error at line %d:' % self.lineno, message
0846 
0847     def unknown_starttag(self, tag, attrs):
0848         self.flush()
0849         if not attrs:
0850             print 'start tag: <' + tag + '>'
0851         else:
0852             print 'start tag: <' + tag,
0853             for name, value in attrs.items():
0854                 print name + '=' + '"' + value + '"',
0855             print '>'
0856 
0857     def unknown_endtag(self, tag):
0858         self.flush()
0859         print 'end tag: </' + tag + '>'
0860 
0861     def unknown_entityref(self, ref):
0862         self.flush()
0863         print '*** unknown entity ref: &' + ref + ';'
0864 
0865     def unknown_charref(self, ref):
0866         self.flush()
0867         print '*** unknown char ref: &#' + ref + ';'
0868 
0869     def close(self):
0870         XMLParser.close(self)
0871         self.flush()
0872 
0873 def test(args = None):
0874     import sys, getopt
0875     from time import time
0876 
0877     if not args:
0878         args = sys.argv[1:]
0879 
0880     opts, args = getopt.getopt(args, 'st')
0881     klass = TestXMLParser
0882     do_time = 0
0883     for o, a in opts:
0884         if o == '-s':
0885             klass = XMLParser
0886         elif o == '-t':
0887             do_time = 1
0888 
0889     if args:
0890         file = args[0]
0891     else:
0892         file = 'test.xml'
0893 
0894     if file == '-':
0895         f = sys.stdin
0896     else:
0897         try:
0898             f = open(file, 'r')
0899         except IOError, msg:
0900             print file, ":", msg
0901             sys.exit(1)
0902 
0903     data = f.read()
0904     if f is not sys.stdin:
0905         f.close()
0906 
0907     x = klass()
0908     t0 = time()
0909     try:
0910         if do_time:
0911             x.feed(data)
0912             x.close()
0913         else:
0914             for c in data:
0915                 x.feed(c)
0916             x.close()
0917     except Error, msg:
0918         t1 = time()
0919         print msg
0920         if do_time:
0921             print 'total time: %g' % (t1-t0)
0922         sys.exit(1)
0923     t1 = time()
0924     if do_time:
0925         print 'total time: %g' % (t1-t0)
0926 
0927 
# Run the test driver when this module is executed as a script.
if __name__ == '__main__':
    test()
0930 

Generated by PyXR 0.9.4
SourceForge.net Logo