0001 """ robotparser.py 0002 0003 Copyright (C) 2000 Bastian Kleineidam 0004 0005 You can choose between two licenses when using this package: 0006 1) GNU GPLv2 0007 2) PSF license for Python 2.2 0008 0009 The robots.txt Exclusion Protocol is implemented as specified in 0010 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html 0011 """ 0012 import urlparse,urllib 0013 0014 __all__ = ["RobotFileParser"] 0015 0016 debug = 0 0017 0018 def _debug(msg): 0019 if debug: print msg 0020 0021 0022 class RobotFileParser: 0023 """ This class provides a set of methods to read, parse and answer 0024 questions about a single robots.txt file. 0025 0026 """ 0027 0028 def __init__(self, url=''): 0029 self.entries = [] 0030 self.default_entry = None 0031 self.disallow_all = False 0032 self.allow_all = False 0033 self.set_url(url) 0034 self.last_checked = 0 0035 0036 def mtime(self): 0037 """Returns the time the robots.txt file was last fetched. 0038 0039 This is useful for long-running web spiders that need to 0040 check for new robots.txt files periodically. 0041 0042 """ 0043 return self.last_checked 0044 0045 def modified(self): 0046 """Sets the time the robots.txt file was last fetched to the 0047 current time. 0048 0049 """ 0050 import time 0051 self.last_checked = time.time() 0052 0053 def set_url(self, url): 0054 """Sets the URL referring to a robots.txt file.""" 0055 self.url = url 0056 self.host, self.path = urlparse.urlparse(url)[1:3] 0057 0058 def read(self): 0059 """Reads the robots.txt URL and feeds it to the parser.""" 0060 opener = URLopener() 0061 f = opener.open(self.url) 0062 lines = [] 0063 line = f.readline() 0064 while line: 0065 lines.append(line.strip()) 0066 line = f.readline() 0067 self.errcode = opener.errcode 0068 if self.errcode == 401 or self.errcode == 403: 0069 self.disallow_all = True 0070 _debug("disallow all") 0071 elif self.errcode >= 400: 0072 self.allow_all = True 0073 _debug("allow all") 0074 elif self.errcode == 200 and lines: 0075 _debug("parse lines") 0076 self.parse(lines) 0077 0078 def _add_entry(self, entry): 0079 if "*" in entry.useragents: 0080 # the default entry is considered last 0081 self.default_entry = entry 0082 else: 0083 self.entries.append(entry) 0084 0085 def parse(self, lines): 0086 """parse the input lines from a robots.txt file. 
        A user-agent: line is allowed to appear without being preceded
        by one or more blank lines.
        """
        # state of the parser:
        #   0: start state, or just finished an entry on a blank line
        #   1: saw one or more user-agent: lines for the current entry
        #   2: saw one or more allow:/disallow: lines for the current entry
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                                                                 line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    """URL opener that records the HTTP status code of the last fetch."""
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)


def _check(a, b):
    """Report whether the observed allowance a matches the expected value b."""
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print


def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)


if __name__ == '__main__':
    _test()
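
# ---------------------------------------------------------------------------
# A minimal offline usage sketch (an illustrative addition, not part of the
# original module or its test suite).  It feeds an inline, made-up robots.txt
# to parse() and queries can_fetch() directly, so no network access is
# required.  The function name, agent names, host and paths are hypothetical.
def _offline_example():
    lines = [
        "User-agent: ExampleBot",
        "Disallow: /private/",
        "",
        "User-agent: *",
        "Disallow: /tmp/",
    ]
    rp = RobotFileParser()
    rp.parse(lines)
    # ExampleBot matches the first entry and is barred from /private/
    print rp.can_fetch("ExampleBot/2.0", "http://www.example.com/private/x")  # False
    # other agents fall through to the catch-all "*" entry
    print rp.can_fetch("OtherBot", "http://www.example.com/private/x")        # True
    print rp.can_fetch("OtherBot", "http://www.example.com/tmp/data")         # False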