PyXR

c:\python24\lib\sgmllib.py


0001 """A parser for SGML, using the derived class as a static DTD."""
0002 
0003 # XXX This only supports those SGML features used by HTML.
0004 
0005 # XXX There should be a way to distinguish between PCDATA (parsed
0006 # character data -- the normal case), RCDATA (replaceable character
0007 # data -- only char and entity references and end tags are special)
0008 # and CDATA (character data -- only end tags are special).  RCDATA is
0009 # not supported at all.
0010 
0011 
0012 import markupbase
0013 import re
0014 
0015 __all__ = ["SGMLParser", "SGMLParseError"]
0016 
# Regular expressions used for parsing

# The two characters that can start something other than plain data:
# '&' (entity/character references) and '<' (tags and declarations).
interesting = re.compile('[&<]')
# A possibly truncated construct: '&' optionally followed by a partial
# entity name or '#digits', or '<' optionally followed by a partial
# start tag, end tag ('/...') or declaration ('!...').
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' followed by one non-alphanumeric character; group 1 is the name.
# The terminating character is part of the match.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one non-digit character; group 1 is the digits.
# The terminating character is part of the match.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by '>' (the empty '<>' tag) or by a letter.
starttagopen = re.compile('<[>a-zA-Z]')
# Opening of the SGML short tag form: '<tag/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag '<tag/data/'; group 1 is the tag, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# The '>' that terminates a processing instruction.
piclose = re.compile('>')
# Either angle bracket; searched for to find where a tag ends.
endbracket = re.compile('[<>]')
# A tag name: a letter followed by letters, digits, '-', '_' or '.'.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value'
# part, group 3 the value itself (single-quoted, double-quoted or
# unquoted; quotes, when present, are included in the group).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
0037 
0038 
class SGMLParseError(RuntimeError):
    """Error signalled by the SGML parser for any parse failure.

    It is a plain RuntimeError subclass with no extra state; the message
    passed to the constructor is the only payload.
    """
0042 
0043 
0044 # SGML parser base class -- find tags and call handler functions.
0045 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
0046 # The dtd is defined by deriving a class which defines methods
0047 # with special names to handle tags: start_foo and end_foo to handle
0048 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
0049 # (Tags are converted to lower case for this purpose.)  The data
0050 # between tags is passed to the parser by calling self.handle_data()
0051 # with some data as argument (the data may be split up in arbitrary
0052 # chunks).  Entity references are passed by calling
0053 # self.handle_entityref() with the entity reference as argument.
0054 
class SGMLParser(markupbase.ParserBase):
    """Base class for SGML parsers: find tags and call handler methods.

    Text is supplied with feed() and the parse is finished with close().
    A start tag <foo> is dispatched to start_foo(attrs) when that method
    exists (the tag is then pushed on self.stack), otherwise to
    do_foo(attrs), otherwise to unknown_starttag().  An end tag </foo>
    is dispatched to end_foo().  Tag names are lower-cased before
    dispatch.  Text between tags is passed to handle_data(), possibly
    split into arbitrary chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None  # source text of the latest start tag
        self.rawdata = ''            # text fed but not yet processed
        self.stack = []              # open tags that have a start_* handler
        self.lasttag = '???'         # latest start tag name, reused by '<>'
        self.nomoretags = 0          # true: everything left is plain data
        self.literal = 0             # true: start tags and '&' are data
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.  The mode is
        left again when goahead() processes the next end tag.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Pass the plain text up to the next '&' or '<' as data.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just data.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_starttag() returns the index after the tag.
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # Unlike the other parse_* calls here, parse_pi()
                    # returns a length, hence the addition.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumes one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same one-character give-back as for charref.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever could not be parsed is delivered as data.
            self.handle_data(rawdata[i:n])
            i = n
        # Keep the unprocessed tail for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    # NOTE(review): presumably read by markupbase.ParserBase while
    # scanning declarations -- it is not referenced in this class.
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        # The handler receives the text between '<?' and '>'.
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text of the most recent start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag at index i.  Returns the index just
    # past the tag (an absolute position in rawdata, not a length), or
    # -1 if the tag is not terminated yet.
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # An attribute without '= value' gets its name as value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume a closing '>'; a '<' that ended the tag is left in place.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag at index i.  Returns the index just past
    # the tag, or -1 if the tag is not terminated yet.
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_* handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag.  Closes every element
    # from the top of the stack down to and including the innermost
    # open element named 'tag'; an empty tag ('</>') closes only the
    # innermost open element.
    def finish_endtag(self, tag):
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Not open: only check whether an end handler exists.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Index of the innermost (last) open element named 'tag';
            # the membership test above guarantees the scan terminates.
            found = len(self.stack) - 1
            while self.stack[found] != tag:
                found = found - 1
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            # Single-argument print(...) produces the same output whether
            # print is a statement or a function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:' + ' ' + str(self.stack))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        name is the text after '&#'.  Values that are not integers in
        the range 0..255 go to unknown_charref(); anything else is
        delivered through handle_data() as the character chr(n).
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        Names found in self.entitydefs are delivered through
        handle_data() as their replacement text; all others go to
        unknown_entityref().

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
0416 
0417 
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass for manual testing: prints events to stdout.

    Character data is collected in self.testdata and printed in chunks;
    every other event first flushes the collected data and then prints
    a one-line description of itself.
    """

    def __init__(self, verbose=0):
        # Character data collected but not yet printed.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Collect data; print it once its repr reaches 70 characters."""
        self.testdata += data
        if len(repr(self.testdata)) < 70:
            return
        self.flush()

    def flush(self):
        """Print the collected character data, if any, and clear it."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data:' + ' ' + repr(pending))

    def handle_comment(self, data):
        """Print the repr of a comment, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            # Keep the first and last 32 characters of the repr.
            shown = '...'.join((shown[:32], shown[-32:]))
        print('comment:' + ' ' + shown)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its name="value" attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # Build the space-separated pieces, then write them as one line.
        pieces = ['start tag: <' + tag]
        for name, value in attrs:
            pieces.append(name + '=' + '"' + value + '"')
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference the parser could not resolve."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference the parser could not resolve."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print the text of an unrecognized declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any character data still collected."""
        SGMLParser.close(self)
        self.flush()
0471 
0472 
def test(args=None):
    """Command-line self-test: parse a file and report what was found.

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser base class instead of
            TestSGMLParser.  The next argument is the file to parse
            ('test.html' when omitted); '-' means standard input.

    Prints a message and exits with status 1 if the file cannot be
    opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError:
            # sys.exc_info() fetches the exception without relying on
            # version-specific 'except' binding syntax.
            msg = sys.exc_info()[1]
            print(' '.join([path, ":", str(msg)]))
            sys.exit(1)
        # Close the file even if reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # The input is fed one character at a time -- presumably to exercise
    # the parser's handling of constructs split across feed() calls.
    for c in data:
        x.feed(c)
    x.close()
0507 
0508 
# Run the command-line self-test when this module is executed as a script.
if __name__ == '__main__':
    test()
0511
Generated by PyXR 0.9.4