PyXR

c:\python24\lib\test\test_sgmllib.py



0001 import pprint
0002 import sgmllib
0003 import unittest
0004 from test import test_support
0005 
0006 
class EventCollector(sgmllib.SGMLParser):
    """SGMLParser subclass that logs every parser callback as a tuple.

    Each handler appends one tuple to the event log.  The first item of
    the tuple names the event ("starttag", "endtag", "comment", "charref",
    "data", "decl", "entityref", "pi" or "unknown decl"); the remaining
    items are the arguments the handler received.
    """

    def __init__(self):
        # The log and its bound append() shortcut are created before the
        # base-class initializer is invoked.
        self.events = []
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the logged events with adjacent "data" events merged.

        Buffering can split one run of character data into several
        "data" events; consecutive ones are joined here so the result
        does not depend on how the input was chunked.  The merged list
        also replaces self.events (self.append keeps pointing at the
        list created in __init__).
        """
        merged = []
        for event in self.events:
            if event[0] == "data" and merged and merged[-1][0] == "data":
                # Extend the previous text run instead of adding a new one.
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
        self.events = merged
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        """Log a start tag together with its attribute list."""
        event = ("starttag", tag, attrs)
        self.append(event)

    def unknown_endtag(self, tag):
        """Log an end tag."""
        event = ("endtag", tag)
        self.append(event)

    # all other markup

    def handle_comment(self, data):
        """Log the text of a comment."""
        event = ("comment", data)
        self.append(event)

    def handle_charref(self, data):
        """Log a character reference."""
        event = ("charref", data)
        self.append(event)

    def handle_data(self, data):
        """Log a piece of character data."""
        event = ("data", data)
        self.append(event)

    def handle_decl(self, decl):
        """Log a declaration."""
        event = ("decl", decl)
        self.append(event)

    def handle_entityref(self, data):
        """Log an entity reference."""
        event = ("entityref", data)
        self.append(event)

    def handle_pi(self, data):
        """Log a processing instruction."""
        event = ("pi", data)
        self.append(event)

    def unknown_decl(self, decl):
        """Log a declaration reported through unknown_decl()."""
        event = ("unknown decl", decl)
        self.append(event)
0059 
0060 
class CDATAEventCollector(EventCollector):
    """EventCollector with a dedicated handler for <cdata> start tags."""

    def start_cdata(self, attrs):
        """Log the <cdata> start tag, then call setliteral().

        test_cdata_content expects markup inside <cdata> to be reported
        as plain data after this handler has run.
        """
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
0065 
0066 
class SGMLParserTestCase(unittest.TestCase):
    """Checks the event stream sgmllib produces for assorted markup.

    Tests hand a source and the expected event list to check_events(),
    or a source that must be rejected to check_parse_error().
    """

    # Parser class instantiated by get_events(); a test may override it
    # on the instance (see test_cdata_content).
    collector = EventCollector

    def get_events(self, source):
        """Parse `source` with a fresh collector and return its events.

        `source` is iterated and each item is passed to feed() on its
        own, so a list of strings is fed chunk by chunk while a plain
        string is fed one character at a time.  Exceptions raised while
        parsing propagate to the caller unchanged.
        """
        parser = self.collector()
        for s in source:
            parser.feed(s)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail the test unless `source` produces `expected_events`.

        The failure message shows both lists pretty-printed.
        """
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail the test unless parsing `source` raises SGMLParseError.

        Unlike get_events(), this always uses a plain EventCollector and
        feeds `source` in a single feed() call.
        """
        parser = EventCollector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        # The same attributes written with different quoting and
        # whitespace must all produce this one start tag event.
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
          ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    # NOTE: this test used to be defined a second time, with an identical
    # body, further down the class; the later definition silently replaced
    # this one.  Only this copy is kept.
    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar   \n   one="1"\ttwo=2   >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        # Instance-level override: get_events() now builds the collector
        # that has a start_cdata handler.
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self.check_events("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        # Each source is a list of chunks fed separately; wherever the
        # split falls, the resulting events should be the same.
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        # These calls previously went to self._run_check, a method this
        # class does not define; check_events is the helper used by every
        # other test here.
        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")
0339 
0340 
def test_main():
    """Run SGMLParserTestCase through test_support.run_unittest()."""
    case_class = SGMLParserTestCase
    test_support.run_unittest(case_class)


# Running this file as a script executes the tests.
if __name__ == "__main__":
    test_main()
0347 

Generated by PyXR 0.9.4
SourceForge.net Logo