"""Tests for HTMLParser.py."""

import HTMLParser
import pprint
import sys
import unittest
from test import test_support


class EventCollector(HTMLParser.HTMLParser):
    """HTMLParser subclass that records each handler callback as a tuple.

    Every handler appends ``(kind, payload, ...)`` to ``self.events`` in the
    order the parser invokes it; ``get_events()`` returns the normalized
    list for comparison against the expected events of a test.
    """

    def __init__(self):
        self.events = []
        # Shortcut used by every handler below; it is bound to the list
        # object itself, so that list must only ever be mutated in place.
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged result replaces the contents of ``self.events`` and the
        same list object is returned.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        # Update in place rather than rebinding: self.append is bound to
        # this list object, so rebinding would silently drop any event
        # recorded after this call.
        self.events[:] = merged
        return self.events

    # structure markup

    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, data):
        self.append(("decl", data))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))


class EventCollectorExtra(EventCollector):
    """EventCollector that also records the raw text of each start tag."""

    def handle_starttag(self, tag, attrs):
        EventCollector.handle_starttag(self, tag, attrs)
        self.append(("starttag_text", self.get_starttag_text()))


class TestCaseBase(unittest.TestCase):
    """Shared helpers for the parser tests."""

    def _run_check(self, source, expected_events, collector=EventCollector):
        """Feed *source* to a fresh collector and compare the events.

        *source* is iterated and each item is passed to ``feed()``
        separately: a plain string is therefore fed one character at a
        time, while a list of strings is fed chunk by chunk.  The test
        fails with a pretty-printed diff of both event lists when the
        normalized events differ from *expected_events*.
        """
        parser = collector()
        for s in source:
            parser.feed(s)
        parser.close()
        events = parser.get_events()
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def _run_check_extra(self, source, events):
        """Like _run_check(), but collect with EventCollectorExtra."""
        self._run_check(source, events, EventCollectorExtra)

    def _parse_error(self, source):
        """Assert that parsing *source* raises HTMLParseError.

        The whole string is fed in a single call to a plain HTMLParser,
        followed by close().
        """
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse)


class HTMLParserTestCase(TestCaseBase):
    # NOTE: the test methods are documented with comments rather than
    # docstrings so that unittest keeps reporting them by method name.

    def test_processing_instruction_only(self):
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])
        self._run_check("<?processing instruction ?>", [
            ("pi", "processing instruction ?"),
            ])

    def test_simple_html(self):
        # The markup lines start at column 0 on purpose: the expected
        # "data" events between the constructs are bare newlines.
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar><<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b-->
</Html>
""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-></foo><bar><<?pi?></foo<bar\ncomment1b"),
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
    ("charref", "x201C"),
    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

    def test_unclosed_entityref(self):
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    def test_doctype_decl(self):
        # The same string is used for the input and the expected event, so
        # the declaration must come back from the parser unchanged.
        inside = """\
DOCTYPE html [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self._run_check("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        # Quoting style and whitespace around "=" must not affect the
        # reported attributes.
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
          ]
        self._run_check("""<a b='v' c="v" d=v e>""", output)
        self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self._run_check("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # Regression test for SF patch #669683.
        self._run_check("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])
        # Regression test for SF bug #921657.
        self._run_check("<a href=mailto:xyz@example.com>", [
            ("starttag", "a", [("href", "mailto:xyz@example.com")]),
            ])

    def test_attr_entity_replacement(self):
        # The entity references must be spelled out in the input; the
        # parser is expected to replace them in the attribute value.
        self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
            ("starttag", "a", [("b", "&><\"'")]),
            ])

    def test_attr_funky_names(self):
        self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_illegal_declarations(self):
        self._parse_error('<!spacer type="block" height="25">')

    def test_starttag_end_boundary(self):
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def test_buffer_artefacts(self):
        # Splitting the input at any point between feed() calls must not
        # change the resulting events.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self._run_check(["", "<!--abc-->"], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check(["<!", "--abc-->"], output)
        self._run_check(["<!-", "-abc-->"], output)
        self._run_check(["<!--", "abc-->"], output)
        self._run_check(["<!--a", "bc-->"], output)
        self._run_check(["<!--ab", "c-->"], output)
        self._run_check(["<!--abc", "-->"], output)
        self._run_check(["<!--abc-", "->"], output)
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)

    def test_starttag_junk_chars(self):
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a foo=>")

    def test_declaration_junk_chars(self):
        self._parse_error("<!DOCTYPE foo $ >")

    def test_startendtag(self):
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def test_get_starttag_text(self):
        # get_starttag_text() must return the tag exactly as written,
        # including its internal whitespace.
        s = """<foo:bar \n one="1"\ttwo=2 >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def test_cdata_content(self):
        # Inside <script>, comment-like, entity-like and tag-like text is
        # expected to come back as plain data.
        s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "script"),
            ])
        s = """<script> <not a='start tag'> </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "script"),
            ])


def test_main():
    """Run the HTMLParser test case through the regression-test helper."""
    test_support.run_unittest(HTMLParserTestCase)


if __name__ == "__main__":
    test_main()

# Generated by PyXR 0.9.4