"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


try:
    import markupbase
except ImportError:
    # Python 3 ships the same ParserBase class under a private name.
    import _markupbase as markupbase
import re

__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Start of anything that is not plain character data.
interesting = re.compile('[&<]')
# A prefix that could still grow into a reference, tag or declaration.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Both reference patterns consume one terminating character; goahead()
# gives it back unless it is ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')


class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
    pass


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLParser(markupbase.ParserBase):
    """Event-driven SGML/HTML tokenizer.

    Text is supplied through feed() and close().  Start tags are
    dispatched to start_<tag>/do_<tag> methods and end tags to
    end_<tag> methods when the instance has them; everything else goes
    to the handle_*() and unknown_*() hooks, which do nothing here.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints a diagnostic.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet consumed
        self.stack = []         # open tags that have a start_<tag> method
        self.lasttag = '???'    # tag reused by the '<>' shorthand
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as can be parsed.

        Whatever is left unparsed (an unterminated tag or reference) is
        kept in self.rawdata for the next call.  When *end* is true the
        leftover is instead passed to handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # parse_pi() returns a length, not an end index.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # Give back the terminator unless it was ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index *i*.

        Passes the text between '<?' and the first '>' to handle_pi().
        Returns the number of characters consumed, or -1 when no '>'
        has arrived yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text recorded by the last parse_starttag()."""
        return self.__starttag_text

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at index *i* of self.rawdata.

        Handles the '<tag/data/' and '<>' shorthands as well as ordinary
        tags with attributes, then dispatches via finish_shorttag() or
        finish_starttag().  Returns the index just past the tag, or -1
        when the tag is not yet complete.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # A bare attribute name is its own value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # The tag may have been cut short by '<'; only swallow a '>'.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag beginning at index *i* of self.rawdata.

        Returns the index just past the tag, or -1 when neither '<' nor
        '>' follows yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        """Report '<tag/data/' as a start tag, its data and an end tag."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        """Dispatch a start tag to start_<tag>, do_<tag> or the fallback.

        Only tags handled by a start_<tag> method are pushed onto
        self.stack.  Returns 1 for start_<tag>, 0 for do_<tag> and -1
        when unknown_starttag() was called.
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close *tag* and every tag opened after it.

        An empty *tag* ('</>') closes the most recently opened tag.  A
        tag that is not on the stack goes to report_unbalanced() when
        an end_<tag> method exists and to unknown_endtag() otherwise.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    # Only the existence of the handler matters here.
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Keep the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Call the start_<tag>/do_<tag> *method* with *attrs*."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Call the end_<tag> *method*."""
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        """Print a diagnostic for a stray end tag when self.verbose is set."""
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: ' + str(self.stack))

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Receive a chunk of character data; does nothing here."""
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Receive the text of a comment; does nothing here."""
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        """Receive the text of a declaration; does nothing here."""
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        """Receive the text of a processing instruction; does nothing here."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestSGMLParser(SGMLParser):
    """Parser that prints every event it receives to standard output."""

    def __init__(self, verbose=0):
        # Must exist before SGMLParser.__init__() runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*, flushing once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear any buffered character data."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Built as one string so a single print() call emits the line.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print the unresolved entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print the unresolved character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print the unrecognized declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing and print any data still buffered."""
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    """Parse a file one character at a time and print the events.

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser instead of TestSGMLParser.
            The next item is the file to read ('-' for standard
            input); 'test.html' is used when none is given.

    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids version-specific 'except' syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even when read() fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()

# Generated by PyXR 0.9.4