c:\python24\lib\test\test_robotparser.py

import unittest, StringIO, robotparser
from test import test_support

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            # An (agent, url) pair overrides the default agent for this case.
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.failUnless(self.parser.can_fetch(agent, url))
        else:
            self.failIf(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

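# For interactive debugging, a single case can be built and run outside
# the suite.  A minimal sketch (the helper name and the one-line
# robots.txt are illustrative, not part of the original tests):
def _debug_one_case():
    p = robotparser.RobotFileParser()
    p.parse(StringIO.StringIO("User-agent: *\nDisallow: /tmp/\n").readlines())
    result = unittest.TestResult()
    RobotTestCase(0, p, '/tmp/xxx', 0, 'test_robotparser').run(result)
    return result.wasSuccessful()   # expect True: /tmp/xxx is disallowed
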
tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    # Parse robots_txt and add one test case per URL to the suite:
    # good_urls must be fetchable, bad_urls must not be.
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

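# For reference: outside of a test, RobotFileParser is usually pointed at
# a live robots.txt rather than fed lines directly.  A minimal sketch,
# not exercised by this suite (the URL is illustrative only):
def _example_live_usage():
    rp = robotparser.RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    rp.read()    # fetches and parses the remote file
    return rp.can_fetch("test_robotparser", "http://www.example.com/tmp/")
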
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']

RobotTest(1, doc, good, bad)
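
# Spelled out, test 1 asserts that with the rules above the default agent
# gets, e.g.:
#   can_fetch('test_robotparser', '/test.html')                 -> True
#   can_fetch('test_robotparser', '/cyberworld/map/index.html') -> False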

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html', '/', '/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL: '/a/b.html' should be allowed (the rule encodes the
           # slash as %2f), but the parser currently disallows it
bad = ['/tmp', '/tmp.html', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a%2fb.html',
       '/~joe/index.html',
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')
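
# Aside on the %-escapes above: '%7e' is the escaped form of '~', which
# is why '/~joe/index.html' is expected to match
# 'Disallow: /%7ejoe/index.html'.  A quick illustration (a hypothetical
# helper, not part of the original suite):
def _show_unquote():
    import urllib
    return urllib.unquote('/%7ejoe/index.html')   # -> '/~joe/index.html'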

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp']  # XFAIL: '/a%2fb.html' should also be allowed here
bad = ['/tmp/', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = []  # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)
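
# The draft RFC matches a rule path as a literal prefix of the request
# path, which is why 'Disallow: /.' blocks neither '/' nor '/foo.html'.
# A simplified sketch of that matching rule (not robotparser's own code):
def _rfc_prefix_match(rule_path, url_path):
    return url_path.startswith(rule_path)
# e.g. _rfc_prefix_match('/.', '/foo.html') -> False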

def test_main():
    test_support.run_suite(tests)

if __name__ == '__main__':
    test_support.verbose = 1    # lowercase "verbose" is the flag run_suite checks
    test_main()
