PyXR

c:\python24\lib\markupbase.py


0001 """Shared support for scanning document type declarations in HTML and XHTML."""
0002 
0003 import re
0004 
# --- Token matchers -------------------------------------------------------
# Both are bound ``match`` methods, called as ``matcher(rawdata, pos)``.
# Each pattern also consumes the whitespace that trails the token.

# A quoted string literal, single- or double-quoted.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# A name: a letter followed by letters, digits, '-', '_' or '.'.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match

# --- Terminator patterns --------------------------------------------------
# These are compiled pattern objects (used through ``.search``).

# End of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')

# End of a standard marked section: "]", "]", ">" with optional whitespace
# between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# End of an MS-Word style marked section: "]", optional whitespace, ">".
# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
_msmarkedsectionclose = re.compile(r']\s*>')

# The module only needs ``re`` to build the objects above.
del re
0016 
0017 
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The class cannot be instantiated directly.  The methods here read the
    input buffer from ``self.rawdata`` and call ``self.handle_decl()`` and
    ``self.handle_comment()``, none of which are defined in this class, so
    a subclass has to supply them.  A subclass must also override
    ``error()``; the parsing methods call it on malformed input and expect
    it to raise.  Should an override return instead, the parsing methods
    give up and return -1 rather than loop or fail on undefined state.

    The ``parse_*`` and ``_parse_*`` methods return the index at which
    scanning should continue, or a negative number when ``rawdata`` ends
    before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; must be overridden by subclasses."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j) # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j-(pos+1)
        else:
            self.offset = self.offset + j-i
        return j

    # Extra characters tolerated inside a declaration; empty by default.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Comments and marked sections are delegated to parse_comment() and
        parse_marked_section().  For any other declaration the text between
        "<!" and ">" is passed to handle_decl() when the declaration name
        is "doctype", otherwise to unknown_decl().

        Returns the index just past the declaration, or a negative number
        if the buffer ends before the declaration is complete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # The empty declaration "<!>": nothing to report, just skip it.
            return j + 1
        if rawdata[j:j+2] == '--': #comment
            # Locate --.*-- as the body of the comment.  This has to be
            # tested before the single '-' guard below, which would
            # otherwise swallow every comment as "incomplete".
            return self.parse_comment(i)
        if rawdata[j:j+1] in ("-", ""):
            # A lone '-' (possible start of a comment) or nothing at all
            # after "<!": treat as incomplete.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[': #marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        #all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # error() returned: j did not advance, so stop here
                    # instead of looping on the same character forever.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1
            if j < 0:
                return j
        return -1 # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting with "<![" at rawdata[i].

        Standard status keywords (temp, cdata, ignore, include, rcdata)
        end at "]]>"; the MS Office keywords (if, else, endif) end at "]>".
        When report is true, the text between "<![" and the terminator is
        passed to unknown_decl().

        Returns the index just past the terminator, or a negative number
        if the section is not complete in the buffer.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i+3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i+3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i+3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # error() returned: there is no terminator pattern to look for.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting with "<!--" at rawdata[i].

        When report is true, the text between "<!--" and the closing
        "--" is passed to handle_comment().  Returns the index just past
        the closing ">", or -1 if the comment is not terminated.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset of a DOCTYPE, starting after its '['.

        declstartpos is the index of the enclosing declaration; it is only
        used to update the position before an error is reported.

        Returns the index of the '>' that follows the closing ']' (after
        optional whitespace), or -1 if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 4) > n:
                    # Fewer than four characters left, so "<!--" cannot be
                    # told apart from other declarations yet; incomplete.
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # error() returned: there is no handler for this name.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name never returns a position at the end of the
                # buffer, so rawdata[j] is safe here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or a negative number
        if the buffer ends before the declaration is complete.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                j = rawdata.find(")", j)
                if j < 0:
                    return -1
                j = j + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:j+1]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or a negative number
        if the buffer ends before the declaration is complete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading '%' and the whitespace after it are skipped before the
        entity name.  Returns the index just past the closing '>', or a
        negative number if the buffer ends before the declaration is
        complete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1    # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i].

        Returns (name, pos) where name is the lower-cased token and pos is
        the index just past the token and its trailing whitespace.
        Returns (None, -1) when the token runs up to the end of the buffer,
        since more input might extend it.  If no name starts at i, the
        position is updated and error() is called.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # error() returned: keep the (name, pos) shape callers unpack.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Receive the text of a declaration that is not a DOCTYPE.

        The default implementation ignores it.
        """
        pass
0384
Generated by PyXR 0.9.4