"""Regression tests for the sgmllib module.

The tests feed markup to an event-recording SGMLParser subclass and compare
the recorded event list against the expected one.
"""

import pprint
import sgmllib
import unittest
from test import test_support


class EventCollector(sgmllib.SGMLParser):
    """SGMLParser subclass that records every parser callback as a tuple.

    Each event is a tuple whose first item names the event kind
    ("starttag", "endtag", "data", ...) and whose remaining items are the
    arguments the parser passed to the callback.
    """

    def __init__(self):
        self.events = []
        # Bound-method shortcut used by all the handlers below.
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged."""
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        # Update the list in place: self.append is bound to this very list
        # object, so rebinding self.events would silently drop any event
        # recorded after this call.
        self.events[:] = merged
        return self.events

    # structure markup

    def unknown_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, decl):
        self.append(("decl", decl))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))


class CDATAEventCollector(EventCollector):
    """EventCollector that switches to literal mode inside <cdata>."""

    def start_cdata(self, attrs):
        self.append(("starttag", "cdata", attrs))
        self.setliteral()


class SGMLParserTestCase(unittest.TestCase):
    """Event-based tests for sgmllib.SGMLParser."""

    # Parser class instantiated by get_events(); individual tests may
    # override it on the instance.
    collector = EventCollector

    def get_events(self, source):
        """Feed each item of `source` to a fresh parser; return its events.

        `source` is iterated and every item is passed to feed() separately,
        so a plain string is fed one character at a time while a list is
        fed one chunk at a time.
        """
        parser = self.collector()
        for chunk in source:
            parser.feed(chunk)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail unless parsing `source` yields exactly `expected_events`."""
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless parsing `source` raises sgmllib.SGMLParseError."""
        parser = EventCollector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        # The same string is used for the input and for the expected event,
        # so the declaration must come back verbatim.
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        # The same attributes written with different quoting and different
        # whitespace around "=" must all produce the same start tag event.
        output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
            ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                          [("starttag", "a", [("b", "xxx\n\txxx"),
                                              ("c", "yyy\t\nyyy"),
                                              ("d", "\txyz\n")])
                           ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        # Inside <cdata> the collector switches to literal mode, so markup
        # and the entity-like text must come back as plain data.
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self.check_events("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        # A comment split across two feed() calls at every possible offset.
        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")


def test_main():
    """Run the sgmllib test case through the regression-test harness."""
    test_support.run_unittest(SGMLParserTestCase)


if __name__ == "__main__":
    test_main()

# Source listing generated by PyXR 0.9.4