"""Shared support for scanning document type declarations in HTML and XHTML."""

import re

_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read ``self.rawdata`` (the input buffer), which the
    subclass must set, and call ``self.handle_decl(data)`` and
    ``self.handle_comment(data)``, which the subclass must define.

    Scanning methods return the index just past the construct they
    consumed, or a negative number when the buffer ends before the
    construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; this class must be subclassed."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a syntax error; subclasses must override this.

        Overrides are expected to raise: several scanning loops call
        error() without advancing their position, so they rely on the
        exception to stop processing malformed input.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated inside a declaration; reset to '' each
    # time a doctype declaration is parsed.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." construct starting at rawdata[i].

        Dispatches comments to parse_comment() and marked sections to
        parse_marked_section().  For other declarations, the text between
        "<!" and ">" is passed to handle_decl() when the declaration name
        is "doctype" and to unknown_decl() otherwise.

        Returns the index just past the construct, or a negative number
        if the buffer ends before the construct is complete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # The empty comment "<!>": nothing to report, just skip it.
            return j + 1
        if rawdata[j:j+2] == "--":
            # Comment.  This must be tested on a two-character slice and
            # before the buffer-boundary check below, otherwise a
            # complete comment is reported as incomplete forever.
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[':
            # Marked section.
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)

        # All other declaration elements.
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![" marked section starting at rawdata[i].

        Standard status keywords (temp, cdata, ignore, include, rcdata)
        end at "]]>"; the MS Office keywords (if, else, endif) end at
        "]>".  When *report* is true, the text between "<![" and the
        closing delimiter is passed to unknown_decl().

        Returns the index just past the closing delimiter, or a negative
        number if the section is not terminated in the buffer.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # Only reached if an error() override returns: there is no
            # closing pattern to search for, so report "not parsed"
            # instead of failing on an unbound local.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at rawdata[i].

        When *report* is true, the comment body is passed to
        handle_comment().  Returns the index just past the closing
        "-->" (whitespace is allowed between "--" and ">"), or -1 if the
        comment is not terminated in the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset starting at rawdata[i].

        *declstartpos* is the index of the enclosing declaration; it is
        used only for position updates before an error is reported.

        Returns the index of the ">" that follows the closing "]" (and
        any whitespace after it), or a negative number if the buffer ends
        first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four
                    # characters left, so "<!--" cannot be recognised)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if the buffer
        ends first.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative number
        if the buffer ends first.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative number
        if the buffer ends first.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" followed by optional whitespace is skipped before
        the entity name.  Returns the index just past the closing ">",
        or a negative number if the buffer ends first.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (plus trailing whitespace) at rawdata[i].

        Returns ``(name, end)`` where *name* is lower-cased and *end* is
        the index just past the token and its trailing whitespace.
        Returns ``(None, -1)`` when the token runs up to the end of the
        buffer, since more of it may still arrive.  Calls error() when no
        name starts at *i*.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # Only reached if an error() override returns; keep the
        # two-tuple shape that every caller unpacks.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration or marked section that is not a doctype.

        The default implementation does nothing.
        """
        pass

# Generated by PyXR 0.9.4