PyXR

c:\python24\lib \ test \ test_htmlparser.py


0001 """Tests for HTMLParser.py."""
0002 
0003 import HTMLParser
0004 import pprint
0005 import sys
0006 import unittest
0007 from test import test_support
0008 
0009 
class EventCollector(HTMLParser.HTMLParser):
    """HTMLParser subclass that records every handler callback as a tuple.

    Each callback appends one tuple to ``self.events``; the first item names
    the event ("starttag", "data", "comment", ...) and the remaining items
    are the arguments the parser passed to the handler.
    """

    def __init__(self):
        # The event list and its append shortcut are created before the
        # base-class initializer is invoked.
        self.events = []
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        Normalizes the list of events so that buffer artefacts don't
        separate runs of contiguous characters.  The normalized list
        replaces ``self.events`` and is also the return value.
        """
        merged = []
        prevtype = None
        for event in self.events:
            kind = event[0]
            if kind == prevtype == "data":
                # Fold this chunk of text into the preceding data event.
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prevtype = kind
        self.events = merged
        # Keep the append shortcut pointing at the current list; otherwise
        # events recorded after this call would land in the discarded list.
        self.append = merged.append
        return merged

    # structure markup

    def handle_starttag(self, tag, attrs):
        """Record a start tag together with its attribute list."""
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        """Record an empty-element tag such as ``<p/>``."""
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record the text of a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a chunk of character data."""
        self.append(("data", data))

    def handle_decl(self, data):
        """Record a declaration."""
        self.append(("decl", data))

    def handle_entityref(self, data):
        """Record an entity reference."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record a processing instruction."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record a declaration passed to the unknown_decl handler."""
        self.append(("unknown decl", decl))
0065 
0066 
class EventCollectorExtra(EventCollector):
    """EventCollector that also records the raw text of each start tag."""

    def handle_starttag(self, tag, attrs):
        """Record the start tag, then a "starttag_text" event with its source text."""
        EventCollector.handle_starttag(self, tag, attrs)
        raw_text = self.get_starttag_text()
        self.append(("starttag_text", raw_text))
0072 
0073 
class TestCaseBase(unittest.TestCase):
    """Shared helpers for feeding input to a parser and checking the outcome."""

    def _run_check(self, source, expected_events, collector=EventCollector):
        """Parse *source* and fail unless the collected events match.

        *source* is iterated and each item is fed to the parser separately,
        so a plain string is fed one character at a time and a list is fed
        chunk by chunk.  *collector* is the parser class to instantiate.
        """
        parser = collector()
        for chunk in source:
            parser.feed(chunk)
        parser.close()
        events = parser.get_events()
        if events == expected_events:
            return
        wanted = pprint.pformat(expected_events)
        received = pprint.pformat(events)
        self.fail("received events did not match expected events\n"
                  "Expected:\n" + wanted +
                  "\nReceived:\n" + received)

    def _run_check_extra(self, source, events):
        """Like _run_check, but collect events with EventCollectorExtra."""
        self._run_check(source, events, collector=EventCollectorExtra)

    def _parse_error(self, source):
        """Assert that parsing *source* in one piece raises HTMLParseError."""
        def feed_and_close():
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, feed_and_close)
0096 
0097 
class HTMLParserTestCase(TestCaseBase):
    """Event-level tests for HTMLParser.

    Each test feeds markup through _run_check (or _run_check_extra) and
    compares the collected event tuples, or uses _parse_error to require
    an HTMLParseError.
    """
    # The test methods are described with comments rather than docstrings
    # so that unittest's verbose output keeps showing the method names.

    # A processing instruction is reported without the leading "<?" and
    # the closing ">"; a trailing "?" stays part of the data.
    def test_processing_instruction_only(self):
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])
        self._run_check("<?processing instruction ?>", [
            ("pi", "processing instruction ?"),
            ])

    # A small document mixing a declaration, tags, entity and character
    # references, comments and text.  Tag and attribute names are expected
    # in lower case; markup inside a comment is expected as comment text.
    def test_simple_html(self):
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b-->
</Html>
""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
    ("charref", "x201C"),
    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

    # An entity reference without the terminating ";" is still reported
    # as an entityref, followed by the remaining text.
    def test_unclosed_entityref(self):
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    # A DOCTYPE with an internal subset is reported as one "decl" event
    # whose data is everything between "<!" and the final ">".
    def test_doctype_decl(self):
        inside = """\
DOCTYPE html [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self._run_check("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    # Ampersands that do not start a reference are reported as plain data.
    def test_bare_ampersands(self):
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    # "<" and ">" that do not form a tag are reported as plain data.
    def test_bare_pointy_brackets(self):
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    # Single-quoted, double-quoted, unquoted and valueless attributes must
    # produce the same result whether separated by nothing extra, spaces,
    # newlines or tabs.
    def test_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
          ]
        self._run_check("""<a b='v' c="v" d=v e>""", output)
        self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    # Whitespace inside quoted values is preserved, empty quoted values
    # give "", and unquoted values may contain "(", ",", ":" and "@".
    def test_attr_values(self):
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self._run_check("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # Regression test for SF patch #669683.
        self._run_check("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])
        # Regression test for SF bug #921657.
        self._run_check("<a href=mailto:xyz@example.com>", [
            ("starttag", "a", [("href", "mailto:xyz@example.com")]),
            ])

    # The five entity references used here are expected to be replaced by
    # their characters inside an attribute value.
    def test_attr_entity_replacement(self):
        self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
            ("starttag", "a", [("b", "&><\"'")]),
            ])

    # Attribute names may contain ".", ":" and "-".
    def test_attr_funky_names(self):
        self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    # A "<!" construct that is not a recognized declaration must raise.
    def test_illegal_declarations(self):
        self._parse_error('<!spacer type="block" height="25">')

    # "<" or ">" inside a quoted attribute value must not end the tag.
    def test_starttag_end_boundary(self):
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    # Each source list is fed to the parser item by item, so these cases
    # split the same markup at every interesting position and expect the
    # same events regardless of where the split falls.
    def test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self._run_check(["", "<!--abc-->"], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check(["<!", "--abc-->"], output)
        self._run_check(["<!-", "-abc-->"], output)
        self._run_check(["<!--", "abc-->"], output)
        self._run_check(["<!--a", "bc-->"], output)
        self._run_check(["<!--ab", "c-->"], output)
        self._run_check(["<!--abc", "-->"], output)
        self._run_check(["<!--abc-", "->"], output)
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)

    # Malformed or truncated start and end tags must raise HTMLParseError.
    def test_starttag_junk_chars(self):
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a foo=>")

    # A stray "$" inside a DOCTYPE declaration must raise HTMLParseError.
    def test_declaration_junk_chars(self):
        self._parse_error("<!DOCTYPE foo $ >")

    # "<p/>" style tags produce a single "startendtag" event, while a
    # separate open/close pair produces "starttag" and "endtag".
    def test_startendtag(self):
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    # get_starttag_text() must return the start tag exactly as written,
    # including its internal whitespace.
    def test_get_starttag_text(self):
        s = """<foo:bar   \n   one="1"\ttwo=2   >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    # Everything between <script> and </script> is expected as one data
    # event, even when it looks like a comment, entity reference or tag.
    def test_cdata_content(self):
        s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "script"),
            ])
        s = """<script> <not a='start tag'> </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "script"),
            ])
0311 
0312 
def test_main():
    """Run every test case class of this module through test_support."""
    case_classes = (HTMLParserTestCase,)
    test_support.run_unittest(*case_classes)
0315 
0316 
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
0319
Generated by PyXR 0.9.4