0001 """An extensible library for opening URLs using a variety of protocols 0002 0003 The simplest way to use this module is to call the urlopen function, 0004 which accepts a string containing a URL or a Request object (described 0005 below). It opens the URL and returns the results as file-like 0006 object; the returned object has some extra methods described below. 0007 0008 The OpenerDirector manages a collection of Handler objects that do 0009 all the actual work. Each Handler implements a particular protocol or 0010 option. The OpenerDirector is a composite object that invokes the 0011 Handlers needed to open the requested URL. For example, the 0012 HTTPHandler performs HTTP GET and POST requests and deals with 0013 non-error returns. The HTTPRedirectHandler automatically deals with 0014 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 0015 deals with digest authentication. 0016 0017 urlopen(url, data=None) -- basic usage is that same as original 0018 urllib. pass the url and optionally data to post to an HTTP URL, and 0019 get a file-like object back. One difference is that you can also pass 0020 a Request instance instead of URL. Raises a URLError (subclass of 0021 IOError); for HTTP errors, raises an HTTPError, which can also be 0022 treated as a valid response. 0023 0024 build_opener -- function that creates a new OpenerDirector instance. 0025 will install the default handlers. accepts one or more Handlers as 0026 arguments, either instances or Handler classes that it will 0027 instantiate. if one of the argument is a subclass of the default 0028 handler, the argument will be installed instead of the default. 0029 0030 install_opener -- installs a new opener as the default opener. 0031 0032 objects of interest: 0033 OpenerDirector -- 0034 0035 Request -- an object that encapsulates the state of a request. the 0036 state can be a simple as the URL. it can also include extra HTTP 0037 headers, e.g. a User-Agent. 0038 0039 BaseHandler -- 0040 0041 exceptions: 0042 URLError-- a subclass of IOError, individual protocols have their own 0043 specific subclass 0044 0045 HTTPError-- also a valid HTTP response, so you can treat an HTTP error 0046 as an exceptional event or valid response 0047 0048 internals: 0049 BaseHandler and parent 0050 _call_chain conventions 0051 0052 Example usage: 0053 0054 import urllib2 0055 0056 # set up authentication info 0057 authinfo = urllib2.HTTPBasicAuthHandler() 0058 authinfo.add_password('realm', 'host', 'username', 'password') 0059 0060 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) 0061 0062 # build a new opener that adds authentication and caching FTP handlers 0063 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) 0064 0065 # install it 0066 urllib2.install_opener(opener) 0067 0068 f = urllib2.urlopen('http://www.python.org/') 0069 0070 0071 """ 0072 0073 # XXX issues: 0074 # If an authentication error handler that tries to perform 0075 # authentication for some reason but fails, how should the error be 0076 # signalled? The client needs to know the HTTP error code. But if 0077 # the handler knows that the problem was, e.g., that it didn't know 0078 # that hash algo that requested in the challenge, it would be good to 0079 # pass that information along to the client, too. 0080 0081 # XXX to do: 0082 # name! 0083 # documentation (getting there) 0084 # complex proxies 0085 # abstract factory for opener 0086 # ftp errors aren't handled cleanly 0087 # gopher can return a socket.error 0088 # check digest against correct (i.e. non-apache) implementation 0089 0090 import base64 0091 import ftplib 0092 import gopherlib 0093 import httplib 0094 import inspect 0095 import md5 0096 import mimetypes 0097 import mimetools 0098 import os 0099 import posixpath 0100 import random 0101 import re 0102 import sha 0103 import socket 0104 import sys 0105 import time 0106 import urlparse 0107 import bisect 0108 import cookielib 0109 0110 try: 0111 from cStringIO import StringIO 0112 except ImportError: 0113 from StringIO import StringIO 0114 0115 # not sure how many of these need to be gotten rid of 0116 from urllib import (unwrap, unquote, splittype, splithost, 0117 addinfourl, splitport, splitgophertype, splitquery, 0118 splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue) 0119 0120 # support for FileHandler, proxies via environment variables 0121 from urllib import localhost, url2pathname, getproxies 0122 0123 __version__ = "2.4" 0124 0125 _opener = None 0126 def urlopen(url, data=None): 0127 global _opener 0128 if _opener is None: 0129 _opener = build_opener() 0130 return _opener.open(url, data) 0131 0132 def install_opener(opener): 0133 global _opener 0134 _opener = opener 0135 0136 # do these error classes make sense? 0137 # make sure all of the IOError stuff is overridden. we just want to be 0138 # subtypes. 0139 0140 class URLError(IOError): 0141 # URLError is a sub-type of IOError, but it doesn't share any of 0142 # the implementation. need to override __init__ and __str__. 0143 # It sets self.args for compatibility with other EnvironmentError 0144 # subclasses, but args doesn't have the typical format with errno in 0145 # slot 0 and strerror in slot 1. This may be better than nothing. 0146 def __init__(self, reason): 0147 self.args = reason, 0148 self.reason = reason 0149 0150 def __str__(self): 0151 return '<urlopen error %s>' % self.reason 0152 0153 class HTTPError(URLError, addinfourl): 0154 """Raised when HTTP error occurs, but also acts like non-error return""" 0155 __super_init = addinfourl.__init__ 0156 0157 def __init__(self, url, code, msg, hdrs, fp): 0158 self.code = code 0159 self.msg = msg 0160 self.hdrs = hdrs 0161 self.fp = fp 0162 self.filename = url 0163 # The addinfourl classes depend on fp being a valid file 0164 # object. In some cases, the HTTPError may not have a valid 0165 # file object. If this happens, the simplest workaround is to 0166 # not initialize the base classes. 0167 if fp is not None: 0168 self.__super_init(fp, hdrs, url) 0169 0170 def __str__(self): 0171 return 'HTTP Error %s: %s' % (self.code, self.msg) 0172 0173 class GopherError(URLError): 0174 pass 0175 0176 0177 class Request: 0178 0179 def __init__(self, url, data=None, headers={}, 0180 origin_req_host=None, unverifiable=False): 0181 # unwrap('<URL:type://host/path>') --> 'type://host/path' 0182 self.__original = unwrap(url) 0183 self.type = None 0184 # self.__r_type is what's left after doing the splittype 0185 self.host = None 0186 self.port = None 0187 self.data = data 0188 self.headers = {} 0189 for key, value in headers.items(): 0190 self.add_header(key, value) 0191 self.unredirected_hdrs = {} 0192 if origin_req_host is None: 0193 origin_req_host = cookielib.request_host(self) 0194 self.origin_req_host = origin_req_host 0195 self.unverifiable = unverifiable 0196 0197 def __getattr__(self, attr): 0198 # XXX this is a fallback mechanism to guard against these 0199 # methods getting called in a non-standard order. this may be 0200 # too complicated and/or unnecessary. 0201 # XXX should the __r_XXX attributes be public? 0202 if attr[:12] == '_Request__r_': 0203 name = attr[12:] 0204 if hasattr(Request, 'get_' + name): 0205 getattr(self, 'get_' + name)() 0206 return getattr(self, attr) 0207 raise AttributeError, attr 0208 0209 def get_method(self): 0210 if self.has_data(): 0211 return "POST" 0212 else: 0213 return "GET" 0214 0215 # XXX these helper methods are lame 0216 0217 def add_data(self, data): 0218 self.data = data 0219 0220 def has_data(self): 0221 return self.data is not None 0222 0223 def get_data(self): 0224 return self.data 0225 0226 def get_full_url(self): 0227 return self.__original 0228 0229 def get_type(self): 0230 if self.type is None: 0231 self.type, self.__r_type = splittype(self.__original) 0232 if self.type is None: 0233 raise ValueError, "unknown url type: %s" % self.__original 0234 return self.type 0235 0236 def get_host(self): 0237 if self.host is None: 0238 self.host, self.__r_host = splithost(self.__r_type) 0239 if self.host: 0240 self.host = unquote(self.host) 0241 return self.host 0242 0243 def get_selector(self): 0244 return self.__r_host 0245 0246 def set_proxy(self, host, type): 0247 self.host, self.type = host, type 0248 self.__r_host = self.__original 0249 0250 def get_origin_req_host(self): 0251 return self.origin_req_host 0252 0253 def is_unverifiable(self): 0254 return self.unverifiable 0255 0256 def add_header(self, key, val): 0257 # useful for something like authentication 0258 self.headers[key.capitalize()] = val 0259 0260 def add_unredirected_header(self, key, val): 0261 # will not be added to a redirected request 0262 self.unredirected_hdrs[key.capitalize()] = val 0263 0264 def has_header(self, header_name): 0265 return (header_name in self.headers or 0266 header_name in self.unredirected_hdrs) 0267 0268 def get_header(self, header_name, default=None): 0269 return self.headers.get( 0270 header_name, 0271 self.unredirected_hdrs.get(header_name, default)) 0272 0273 def header_items(self): 0274 hdrs = self.unredirected_hdrs.copy() 0275 hdrs.update(self.headers) 0276 return hdrs.items() 0277 0278 class OpenerDirector: 0279 def __init__(self): 0280 server_version = "Python-urllib/%s" % __version__ 0281 self.addheaders = [('User-agent', server_version)] 0282 # manage the individual handlers 0283 self.handlers = [] 0284 self.handle_open = {} 0285 self.handle_error = {} 0286 self.process_response = {} 0287 self.process_request = {} 0288 0289 def add_handler(self, handler): 0290 added = False 0291 for meth in dir(handler): 0292 i = meth.find("_") 0293 protocol = meth[:i] 0294 condition = meth[i+1:] 0295 0296 if condition.startswith("error"): 0297 j = condition.find("_") + i + 1 0298 kind = meth[j+1:] 0299 try: 0300 kind = int(kind) 0301 except ValueError: 0302 pass 0303 lookup = self.handle_error.get(protocol, {}) 0304 self.handle_error[protocol] = lookup 0305 elif condition == "open": 0306 kind = protocol 0307 lookup = getattr(self, "handle_"+condition) 0308 elif condition in ["response", "request"]: 0309 kind = protocol 0310 lookup = getattr(self, "process_"+condition) 0311 else: 0312 continue 0313 0314 handlers = lookup.setdefault(kind, []) 0315 if handlers: 0316 bisect.insort(handlers, handler) 0317 else: 0318 handlers.append(handler) 0319 added = True 0320 0321 if added: 0322 # XXX why does self.handlers need to be sorted? 0323 bisect.insort(self.handlers, handler) 0324 handler.add_parent(self) 0325 0326 def close(self): 0327 # Only exists for backwards compatibility. 0328 pass 0329 0330 def _call_chain(self, chain, kind, meth_name, *args): 0331 # XXX raise an exception if no one else should try to handle 0332 # this url. return None if you can't but someone else could. 0333 handlers = chain.get(kind, ()) 0334 for handler in handlers: 0335 func = getattr(handler, meth_name) 0336 0337 result = func(*args) 0338 if result is not None: 0339 return result 0340 0341 def open(self, fullurl, data=None): 0342 # accept a URL or a Request object 0343 if isinstance(fullurl, basestring): 0344 req = Request(fullurl, data) 0345 else: 0346 req = fullurl 0347 if data is not None: 0348 req.add_data(data) 0349 0350 protocol = req.get_type() 0351 0352 # pre-process request 0353 meth_name = protocol+"_request" 0354 for processor in self.process_request.get(protocol, []): 0355 meth = getattr(processor, meth_name) 0356 req = meth(req) 0357 0358 response = self._open(req, data) 0359 0360 # post-process response 0361 meth_name = protocol+"_response" 0362 for processor in self.process_response.get(protocol, []): 0363 meth = getattr(processor, meth_name) 0364 response = meth(req, response) 0365 0366 return response 0367 0368 def _open(self, req, data=None): 0369 result = self._call_chain(self.handle_open, 'default', 0370 'default_open', req) 0371 if result: 0372 return result 0373 0374 protocol = req.get_type() 0375 result = self._call_chain(self.handle_open, protocol, protocol + 0376 '_open', req) 0377 if result: 0378 return result 0379 0380 return self._call_chain(self.handle_open, 'unknown', 0381 'unknown_open', req) 0382 0383 def error(self, proto, *args): 0384 if proto in ['http', 'https']: 0385 # XXX http[s] protocols are special-cased 0386 dict = self.handle_error['http'] # https is not different than http 0387 proto = args[2] # YUCK! 0388 meth_name = 'http_error_%s' % proto 0389 http_err = 1 0390 orig_args = args 0391 else: 0392 dict = self.handle_error 0393 meth_name = proto + '_error' 0394 http_err = 0 0395 args = (dict, proto, meth_name) + args 0396 result = self._call_chain(*args) 0397 if result: 0398 return result 0399 0400 if http_err: 0401 args = (dict, 'default', 'http_error_default') + orig_args 0402 return self._call_chain(*args) 0403 0404 # XXX probably also want an abstract factory that knows when it makes 0405 # sense to skip a superclass in favor of a subclass and when it might 0406 # make sense to include both 0407 0408 def build_opener(*handlers): 0409 """Create an opener object from a list of handlers. 0410 0411 The opener will use several default handlers, including support 0412 for HTTP and FTP. 0413 0414 If any of the handlers passed as arguments are subclasses of the 0415 default handlers, the default handlers will not be used. 0416 """ 0417 0418 opener = OpenerDirector() 0419 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, 0420 HTTPDefaultErrorHandler, HTTPRedirectHandler, 0421 FTPHandler, FileHandler, HTTPErrorProcessor] 0422 if hasattr(httplib, 'HTTPS'): 0423 default_classes.append(HTTPSHandler) 0424 skip = [] 0425 for klass in default_classes: 0426 for check in handlers: 0427 if inspect.isclass(check): 0428 if issubclass(check, klass): 0429 skip.append(klass) 0430 elif isinstance(check, klass): 0431 skip.append(klass) 0432 for klass in skip: 0433 default_classes.remove(klass) 0434 0435 for klass in default_classes: 0436 opener.add_handler(klass()) 0437 0438 for h in handlers: 0439 if inspect.isclass(h): 0440 h = h() 0441 opener.add_handler(h) 0442 return opener 0443 0444 class BaseHandler: 0445 handler_order = 500 0446 0447 def add_parent(self, parent): 0448 self.parent = parent 0449 0450 def close(self): 0451 # Only exists for backwards compatibility 0452 pass 0453 0454 def __lt__(self, other): 0455 if not hasattr(other, "handler_order"): 0456 # Try to preserve the old behavior of having custom classes 0457 # inserted after default ones (works only for custom user 0458 # classes which are not aware of handler_order). 0459 return True 0460 return self.handler_order < other.handler_order 0461 0462 0463 class HTTPErrorProcessor(BaseHandler): 0464 """Process HTTP error responses.""" 0465 handler_order = 1000 # after all other processing 0466 0467 def http_response(self, request, response): 0468 code, msg, hdrs = response.code, response.msg, response.info() 0469 0470 if code not in (200, 206): 0471 response = self.parent.error( 0472 'http', request, response, code, msg, hdrs) 0473 0474 return response 0475 0476 https_response = http_response 0477 0478 class HTTPDefaultErrorHandler(BaseHandler): 0479 def http_error_default(self, req, fp, code, msg, hdrs): 0480 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) 0481 0482 class HTTPRedirectHandler(BaseHandler): 0483 # maximum number of redirections to any single URL 0484 # this is needed because of the state that cookies introduce 0485 max_repeats = 4 0486 # maximum total number of redirections (regardless of URL) before 0487 # assuming we're in a loop 0488 max_redirections = 10 0489 0490 def redirect_request(self, req, fp, code, msg, headers, newurl): 0491 """Return a Request or None in response to a redirect. 0492 0493 This is called by the http_error_30x methods when a 0494 redirection response is received. If a redirection should 0495 take place, return a new Request to allow http_error_30x to 0496 perform the redirect. Otherwise, raise HTTPError if no-one 0497 else should try to handle this url. Return None if you can't 0498 but another Handler might. 0499 """ 0500 m = req.get_method() 0501 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") 0502 or code in (301, 302, 303) and m == "POST"): 0503 # Strictly (according to RFC 2616), 301 or 302 in response 0504 # to a POST MUST NOT cause a redirection without confirmation 0505 # from the user (of urllib2, in this case). In practice, 0506 # essentially all clients do redirect in this case, so we 0507 # do the same. 0508 return Request(newurl, 0509 headers=req.headers, 0510 origin_req_host=req.get_origin_req_host(), 0511 unverifiable=True) 0512 else: 0513 raise HTTPError(req.get_full_url(), code, msg, headers, fp) 0514 0515 # Implementation note: To avoid the server sending us into an 0516 # infinite loop, the request object needs to track what URLs we 0517 # have already seen. Do this by adding a handler-specific 0518 # attribute to the Request object. 0519 def http_error_302(self, req, fp, code, msg, headers): 0520 # Some servers (incorrectly) return multiple Location headers 0521 # (so probably same goes for URI). Use first header. 0522 if 'location' in headers: 0523 newurl = headers.getheaders('location')[0] 0524 elif 'uri' in headers: 0525 newurl = headers.getheaders('uri')[0] 0526 else: 0527 return 0528 newurl = urlparse.urljoin(req.get_full_url(), newurl) 0529 0530 # XXX Probably want to forget about the state of the current 0531 # request, although that might interact poorly with other 0532 # handlers that also use handler-specific request attributes 0533 new = self.redirect_request(req, fp, code, msg, headers, newurl) 0534 if new is None: 0535 return 0536 0537 # loop detection 0538 # .redirect_dict has a key url if url was previously visited. 0539 if hasattr(req, 'redirect_dict'): 0540 visited = new.redirect_dict = req.redirect_dict 0541 if (visited.get(newurl, 0) >= self.max_repeats or 0542 len(visited) >= self.max_redirections): 0543 raise HTTPError(req.get_full_url(), code, 0544 self.inf_msg + msg, headers, fp) 0545 else: 0546 visited = new.redirect_dict = req.redirect_dict = {} 0547 visited[newurl] = visited.get(newurl, 0) + 1 0548 0549 # Don't close the fp until we are sure that we won't use it 0550 # with HTTPError. 0551 fp.read() 0552 fp.close() 0553 0554 return self.parent.open(new) 0555 0556 http_error_301 = http_error_303 = http_error_307 = http_error_302 0557 0558 inf_msg = "The HTTP server returned a redirect error that would " \ 0559 "lead to an infinite loop.\n" \ 0560 "The last 30x error message was:\n" 0561 0562 class ProxyHandler(BaseHandler): 0563 # Proxies must be in front 0564 handler_order = 100 0565 0566 def __init__(self, proxies=None): 0567 if proxies is None: 0568 proxies = getproxies() 0569 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 0570 self.proxies = proxies 0571 for type, url in proxies.items(): 0572 setattr(self, '%s_open' % type, 0573 lambda r, proxy=url, type=type, meth=self.proxy_open: \ 0574 meth(r, proxy, type)) 0575 0576 def proxy_open(self, req, proxy, type): 0577 orig_type = req.get_type() 0578 type, r_type = splittype(proxy) 0579 host, XXX = splithost(r_type) 0580 if '@' in host: 0581 user_pass, host = host.split('@', 1) 0582 if ':' in user_pass: 0583 user, password = user_pass.split(':', 1) 0584 user_pass = base64.encodestring('%s:%s' % (unquote(user), 0585 unquote(password))) 0586 req.add_header('Proxy-authorization', 'Basic ' + user_pass) 0587 host = unquote(host) 0588 req.set_proxy(host, type) 0589 if orig_type == type: 0590 # let other handlers take care of it 0591 # XXX this only makes sense if the proxy is before the 0592 # other handlers 0593 return None 0594 else: 0595 # need to start over, because the other handlers don't 0596 # grok the proxy's URL type 0597 return self.parent.open(req) 0598 0599 # feature suggested by Duncan Booth 0600 # XXX custom is not a good name 0601 class CustomProxy: 0602 # either pass a function to the constructor or override handle 0603 def __init__(self, proto, func=None, proxy_addr=None): 0604 self.proto = proto 0605 self.func = func 0606 self.addr = proxy_addr 0607 0608 def handle(self, req): 0609 if self.func and self.func(req): 0610 return 1 0611 0612 def get_proxy(self): 0613 return self.addr 0614 0615 class CustomProxyHandler(BaseHandler): 0616 # Proxies must be in front 0617 handler_order = 100 0618 0619 def __init__(self, *proxies): 0620 self.proxies = {} 0621 0622 def proxy_open(self, req): 0623 proto = req.get_type() 0624 try: 0625 proxies = self.proxies[proto] 0626 except KeyError: 0627 return None 0628 for p in proxies: 0629 if p.handle(req): 0630 req.set_proxy(p.get_proxy()) 0631 return self.parent.open(req) 0632 return None 0633 0634 def do_proxy(self, p, req): 0635 return self.parent.open(req) 0636 0637 def add_proxy(self, cpo): 0638 if cpo.proto in self.proxies: 0639 self.proxies[cpo.proto].append(cpo) 0640 else: 0641 self.proxies[cpo.proto] = [cpo] 0642 0643 class HTTPPasswordMgr: 0644 def __init__(self): 0645 self.passwd = {} 0646 0647 def add_password(self, realm, uri, user, passwd): 0648 # uri could be a single URI or a sequence 0649 if isinstance(uri, basestring): 0650 uri = [uri] 0651 uri = tuple(map(self.reduce_uri, uri)) 0652 if not realm in self.passwd: 0653 self.passwd[realm] = {} 0654 self.passwd[realm][uri] = (user, passwd) 0655 0656 def find_user_password(self, realm, authuri): 0657 domains = self.passwd.get(realm, {}) 0658 authuri = self.reduce_uri(authuri) 0659 for uris, authinfo in domains.iteritems(): 0660 for uri in uris: 0661 if self.is_suburi(uri, authuri): 0662 return authinfo 0663 return None, None 0664 0665 def reduce_uri(self, uri): 0666 """Accept netloc or URI and extract only the netloc and path""" 0667 parts = urlparse.urlparse(uri) 0668 if parts[1]: 0669 return parts[1], parts[2] or '/' 0670 else: 0671 return parts[2], '/' 0672 0673 def is_suburi(self, base, test): 0674 """Check if test is below base in a URI tree 0675 0676 Both args must be URIs in reduced form. 0677 """ 0678 if base == test: 0679 return True 0680 if base[0] != test[0]: 0681 return False 0682 common = posixpath.commonprefix((base[1], test[1])) 0683 if len(common) == len(base[1]): 0684 return True 0685 return False 0686 0687 0688 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 0689 0690 def find_user_password(self, realm, authuri): 0691 user, password = HTTPPasswordMgr.find_user_password(self, realm, 0692 authuri) 0693 if user is not None: 0694 return user, password 0695 return HTTPPasswordMgr.find_user_password(self, None, authuri) 0696 0697 0698 class AbstractBasicAuthHandler: 0699 0700 rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I) 0701 0702 # XXX there can actually be multiple auth-schemes in a 0703 # www-authenticate header. should probably be a lot more careful 0704 # in parsing them to extract multiple alternatives 0705 0706 def __init__(self, password_mgr=None): 0707 if password_mgr is None: 0708 password_mgr = HTTPPasswordMgr() 0709 self.passwd = password_mgr 0710 self.add_password = self.passwd.add_password 0711 0712 def http_error_auth_reqed(self, authreq, host, req, headers): 0713 # XXX could be multiple headers 0714 authreq = headers.get(authreq, None) 0715 if authreq: 0716 mo = AbstractBasicAuthHandler.rx.search(authreq) 0717 if mo: 0718 scheme, realm = mo.groups() 0719 if scheme.lower() == 'basic': 0720 return self.retry_http_basic_auth(host, req, realm) 0721 0722 def retry_http_basic_auth(self, host, req, realm): 0723 user,pw = self.passwd.find_user_password(realm, host) 0724 if pw is not None: 0725 raw = "%s:%s" % (user, pw) 0726 auth = 'Basic %s' % base64.encodestring(raw).strip() 0727 if req.headers.get(self.auth_header, None) == auth: 0728 return None 0729 req.add_header(self.auth_header, auth) 0730 return self.parent.open(req) 0731 else: 0732 return None 0733 0734 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 0735 0736 auth_header = 'Authorization' 0737 0738 def http_error_401(self, req, fp, code, msg, headers): 0739 host = urlparse.urlparse(req.get_full_url())[1] 0740 return self.http_error_auth_reqed('www-authenticate', 0741 host, req, headers) 0742 0743 0744 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 0745 0746 auth_header = 'Proxy-authorization' 0747 0748 def http_error_407(self, req, fp, code, msg, headers): 0749 host = req.get_host() 0750 return self.http_error_auth_reqed('proxy-authenticate', 0751 host, req, headers) 0752 0753 0754 def randombytes(n): 0755 """Return n random bytes.""" 0756 # Use /dev/urandom if it is available. Fall back to random module 0757 # if not. It might be worthwhile to extend this function to use 0758 # other platform-specific mechanisms for getting random bytes. 0759 if os.path.exists("/dev/urandom"): 0760 f = open("/dev/urandom") 0761 s = f.read(n) 0762 f.close() 0763 return s 0764 else: 0765 L = [chr(random.randrange(0, 256)) for i in range(n)] 0766 return "".join(L) 0767 0768 class AbstractDigestAuthHandler: 0769 # Digest authentication is specified in RFC 2617. 0770 0771 # XXX The client does not inspect the Authentication-Info header 0772 # in a successful response. 0773 0774 # XXX It should be possible to test this implementation against 0775 # a mock server that just generates a static set of challenges. 0776 0777 # XXX qop="auth-int" supports is shaky 0778 0779 def __init__(self, passwd=None): 0780 if passwd is None: 0781 passwd = HTTPPasswordMgr() 0782 self.passwd = passwd 0783 self.add_password = self.passwd.add_password 0784 self.retried = 0 0785 self.nonce_count = 0 0786 0787 def reset_retry_count(self): 0788 self.retried = 0 0789 0790 def http_error_auth_reqed(self, auth_header, host, req, headers): 0791 authreq = headers.get(auth_header, None) 0792 if self.retried > 5: 0793 # Don't fail endlessly - if we failed once, we'll probably 0794 # fail a second time. Hm. Unless the Password Manager is 0795 # prompting for the information. Crap. This isn't great 0796 # but it's better than the current 'repeat until recursion 0797 # depth exceeded' approach <wink> 0798 raise HTTPError(req.get_full_url(), 401, "digest auth failed", 0799 headers, None) 0800 else: 0801 self.retried += 1 0802 if authreq: 0803 scheme = authreq.split()[0] 0804 if scheme.lower() == 'digest': 0805 return self.retry_http_digest_auth(req, authreq) 0806 else: 0807 raise ValueError("AbstractDigestAuthHandler doesn't know " 0808 "about %s"%(scheme)) 0809 0810 def retry_http_digest_auth(self, req, auth): 0811 token, challenge = auth.split(' ', 1) 0812 chal = parse_keqv_list(parse_http_list(challenge)) 0813 auth = self.get_authorization(req, chal) 0814 if auth: 0815 auth_val = 'Digest %s' % auth 0816 if req.headers.get(self.auth_header, None) == auth_val: 0817 return None 0818 req.add_header(self.auth_header, auth_val) 0819 resp = self.parent.open(req) 0820 return resp 0821 0822 def get_cnonce(self, nonce): 0823 # The cnonce-value is an opaque 0824 # quoted string value provided by the client and used by both client 0825 # and server to avoid chosen plaintext attacks, to provide mutual 0826 # authentication, and to provide some message integrity protection. 0827 # This isn't a fabulous effort, but it's probably Good Enough. 0828 dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(), 0829 randombytes(8))).hexdigest() 0830 return dig[:16] 0831 0832 def get_authorization(self, req, chal): 0833 try: 0834 realm = chal['realm'] 0835 nonce = chal['nonce'] 0836 qop = chal.get('qop') 0837 algorithm = chal.get('algorithm', 'MD5') 0838 # mod_digest doesn't send an opaque, even though it isn't 0839 # supposed to be optional 0840 opaque = chal.get('opaque', None) 0841 except KeyError: 0842 return None 0843 0844 H, KD = self.get_algorithm_impls(algorithm) 0845 if H is None: 0846 return None 0847 0848 user, pw = self.passwd.find_user_password(realm, req.get_full_url()) 0849 if user is None: 0850 return None 0851 0852 # XXX not implemented yet 0853 if req.has_data(): 0854 entdig = self.get_entity_digest(req.get_data(), chal) 0855 else: 0856 entdig = None 0857 0858 A1 = "%s:%s:%s" % (user, realm, pw) 0859 A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET', 0860 # XXX selector: what about proxies and full urls 0861 req.get_selector()) 0862 if qop == 'auth': 0863 self.nonce_count += 1 0864 ncvalue = '%08x' % self.nonce_count 0865 cnonce = self.get_cnonce(nonce) 0866 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) 0867 respdig = KD(H(A1), noncebit) 0868 elif qop is None: 0869 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 0870 else: 0871 # XXX handle auth-int. 0872 pass 0873 0874 # XXX should the partial digests be encoded too? 0875 0876 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 0877 'response="%s"' % (user, realm, nonce, req.get_selector(), 0878 respdig) 0879 if opaque: 0880 base = base + ', opaque="%s"' % opaque 0881 if entdig: 0882 base = base + ', digest="%s"' % entdig 0883 if algorithm != 'MD5': 0884 base = base + ', algorithm="%s"' % algorithm 0885 if qop: 0886 base = base + ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 0887 return base 0888 0889 def get_algorithm_impls(self, algorithm): 0890 # lambdas assume digest modules are imported at the top level 0891 if algorithm == 'MD5': 0892 H = lambda x: md5.new(x).hexdigest() 0893 elif algorithm == 'SHA': 0894 H = lambda x: sha.new(x).hexdigest() 0895 # XXX MD5-sess 0896 KD = lambda s, d: H("%s:%s" % (s, d)) 0897 return H, KD 0898 0899 def get_entity_digest(self, data, chal): 0900 # XXX not implemented yet 0901 return None 0902 0903 0904 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 0905 """An authentication protocol defined by RFC 2069 0906 0907 Digest authentication improves on basic authentication because it 0908 does not transmit passwords in the clear. 0909 """ 0910 0911 auth_header = 'Authorization' 0912 0913 def http_error_401(self, req, fp, code, msg, headers): 0914 host = urlparse.urlparse(req.get_full_url())[1] 0915 retry = self.http_error_auth_reqed('www-authenticate', 0916 host, req, headers) 0917 self.reset_retry_count() 0918 return retry 0919 0920 0921 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 0922 0923 auth_header = 'Proxy-Authorization' 0924 0925 def http_error_407(self, req, fp, code, msg, headers): 0926 host = req.get_host() 0927 retry = self.http_error_auth_reqed('proxy-authenticate', 0928 host, req, headers) 0929 self.reset_retry_count() 0930 return retry 0931 0932 class AbstractHTTPHandler(BaseHandler): 0933 0934 def __init__(self, debuglevel=0): 0935 self._debuglevel = debuglevel 0936 0937 def set_http_debuglevel(self, level): 0938 self._debuglevel = level 0939 0940 def do_request_(self, request): 0941 host = request.get_host() 0942 if not host: 0943 raise URLError('no host given') 0944 0945 if request.has_data(): # POST 0946 data = request.get_data() 0947 if not request.has_header('Content-type'): 0948 request.add_unredirected_header( 0949 'Content-type', 0950 'application/x-www-form-urlencoded') 0951 if not request.has_header('Content-length'): 0952 request.add_unredirected_header( 0953 'Content-length', '%d' % len(data)) 0954 0955 scheme, sel = splittype(request.get_selector()) 0956 sel_host, sel_path = splithost(sel) 0957 if not request.has_header('Host'): 0958 request.add_unredirected_header('Host', sel_host or host) 0959 for name, value in self.parent.addheaders: 0960 name = name.capitalize() 0961 if not request.has_header(name): 0962 request.add_unredirected_header(name, value) 0963 0964 return request 0965 0966 def do_open(self, http_class, req): 0967 """Return an addinfourl object for the request, using http_class. 0968 0969 http_class must implement the HTTPConnection API from httplib. 0970 The addinfourl return value is a file-like object. It also 0971 has methods and attributes including: 0972 - info(): return a mimetools.Message object for the headers 0973 - geturl(): return the original request URL 0974 - code: HTTP status code 0975 """ 0976 host = req.get_host() 0977 if not host: 0978 raise URLError('no host given') 0979 0980 h = http_class(host) # will parse host:port 0981 h.set_debuglevel(self._debuglevel) 0982 0983 headers = dict(req.headers) 0984 headers.update(req.unredirected_hdrs) 0985 # We want to make an HTTP/1.1 request, but the addinfourl 0986 # class isn't prepared to deal with a persistent connection. 0987 # It will try to read all remaining data from the socket, 0988 # which will block while the server waits for the next request. 0989 # So make sure the connection gets closed after the (only) 0990 # request. 0991 headers["Connection"] = "close" 0992 try: 0993 h.request(req.get_method(), req.get_selector(), req.data, headers) 0994 r = h.getresponse() 0995 except socket.error, err: # XXX what error? 0996 raise URLError(err) 0997 0998 # Pick apart the HTTPResponse object to get the addinfourl 0999 # object initialized properly. 1000 1001 # Wrap the HTTPResponse object in socket's file object adapter 1002 # for Windows. That adapter calls recv(), so delegate recv() 1003 # to read(). This weird wrapping allows the returned object to 1004 # have readline() and readlines() methods. 1005 1006 # XXX It might be better to extract the read buffering code 1007 # out of socket._fileobject() and into a base class. 1008 1009 r.recv = r.read 1010 fp = socket._fileobject(r) 1011 1012 resp = addinfourl(fp, r.msg, req.get_full_url()) 1013 resp.code = r.status 1014 resp.msg = r.reason 1015 return resp 1016 1017 1018 class HTTPHandler(AbstractHTTPHandler): 1019 1020 def http_open(self, req): 1021 return self.do_open(httplib.HTTPConnection, req) 1022 1023 http_request = AbstractHTTPHandler.do_request_ 1024 1025 if hasattr(httplib, 'HTTPS'): 1026 class HTTPSHandler(AbstractHTTPHandler): 1027 1028 def https_open(self, req): 1029 return self.do_open(httplib.HTTPSConnection, req) 1030 1031 https_request = AbstractHTTPHandler.do_request_ 1032 1033 class HTTPCookieProcessor(BaseHandler): 1034 def __init__(self, cookiejar=None): 1035 if cookiejar is None: 1036 cookiejar = cookielib.CookieJar() 1037 self.cookiejar = cookiejar 1038 1039 def http_request(self, request): 1040 self.cookiejar.add_cookie_header(request) 1041 return request 1042 1043 def http_response(self, request, response): 1044 self.cookiejar.extract_cookies(response, request) 1045 return response 1046 1047 https_request = http_request 1048 https_response = http_response 1049 1050 class UnknownHandler(BaseHandler): 1051 def unknown_open(self, req): 1052 type = req.get_type() 1053 raise URLError('unknown url type: %s' % type) 1054 1055 def parse_keqv_list(l): 1056 """Parse list of key=value strings where keys are not duplicated.""" 1057 parsed = {} 1058 for elt in l: 1059 k, v = elt.split('=', 1) 1060 if v[0] == '"' and v[-1] == '"': 1061 v = v[1:-1] 1062 parsed[k] = v 1063 return parsed 1064 1065 def parse_http_list(s): 1066 """Parse lists as described by RFC 2068 Section 2. 1067 1068 In particular, parse comma-separated lists where the elements of 1069 the list may include quoted-strings. A quoted-string could 1070 contain a comma. 1071 """ 1072 # XXX this function could probably use more testing 1073 1074 list = [] 1075 end = len(s) 1076 i = 0 1077 inquote = 0 1078 start = 0 1079 while i < end: 1080 cur = s[i:] 1081 c = cur.find(',') 1082 q = cur.find('"') 1083 if c == -1: 1084 list.append(s[start:]) 1085 break 1086 if q == -1: 1087 if inquote: 1088 raise ValueError, "unbalanced quotes" 1089 else: 1090 list.append(s[start:i+c]) 1091 i = i + c + 1 1092 continue 1093 if inquote: 1094 if q < c: 1095 list.append(s[start:i+c]) 1096 i = i + c + 1 1097 start = i 1098 inquote = 0 1099 else: 1100 i = i + q 1101 else: 1102 if c < q: 1103 list.append(s[start:i+c]) 1104 i = i + c + 1 1105 start = i 1106 else: 1107 inquote = 1 1108 i = i + q + 1 1109 return map(lambda x: x.strip(), list) 1110 1111 class FileHandler(BaseHandler): 1112 # Use local file or FTP depending on form of URL 1113 def file_open(self, req): 1114 url = req.get_selector() 1115 if url[:2] == '//' and url[2:3] != '/': 1116 req.type = 'ftp' 1117 return self.parent.open(req) 1118 else: 1119 return self.open_local_file(req) 1120 1121 # names for the localhost 1122 names = None 1123 def get_names(self): 1124 if FileHandler.names is None: 1125 FileHandler.names = (socket.gethostbyname('localhost'), 1126 socket.gethostbyname(socket.gethostname())) 1127 return FileHandler.names 1128 1129 # not entirely sure what the rules are here 1130 def open_local_file(self, req): 1131 import email.Utils 1132 host = req.get_host() 1133 file = req.get_selector() 1134 localfile = url2pathname(file) 1135 stats = os.stat(localfile) 1136 size = stats.st_size 1137 modified = email.Utils.formatdate(stats.st_mtime, usegmt=True) 1138 mtype = mimetypes.guess_type(file)[0] 1139 headers = mimetools.Message(StringIO( 1140 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1141 (mtype or 'text/plain', size, modified))) 1142 if host: 1143 host, port = splitport(host) 1144 if not host or \ 1145 (not port and socket.gethostbyname(host) in self.get_names()): 1146 return addinfourl(open(localfile, 'rb'), 1147 headers, 'file:'+file) 1148 raise URLError('file not on local host') 1149 1150 class FTPHandler(BaseHandler): 1151 def ftp_open(self, req): 1152 host = req.get_host() 1153 if not host: 1154 raise IOError, ('ftp error', 'no host given') 1155 host, port = splitport(host) 1156 if port is None: 1157 port = ftplib.FTP_PORT 1158 else: 1159 port = int(port) 1160 1161 # username/password handling 1162 user, host = splituser(host) 1163 if user: 1164 user, passwd = splitpasswd(user) 1165 else: 1166 passwd = None 1167 host = unquote(host) 1168 user = unquote(user or '') 1169 passwd = unquote(passwd or '') 1170 1171 try: 1172 host = socket.gethostbyname(host) 1173 except socket.error, msg: 1174 raise URLError(msg) 1175 path, attrs = splitattr(req.get_selector()) 1176 dirs = path.split('/') 1177 dirs = map(unquote, dirs) 1178 dirs, file = dirs[:-1], dirs[-1] 1179 if dirs and not dirs[0]: 1180 dirs = dirs[1:] 1181 try: 1182 fw = self.connect_ftp(user, passwd, host, port, dirs) 1183 type = file and 'I' or 'D' 1184 for attr in attrs: 1185 attr, value = splitvalue(attr) 1186 if attr.lower() == 'type' and \ 1187 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1188 type = value.upper() 1189 fp, retrlen = fw.retrfile(file, type) 1190 headers = "" 1191 mtype = mimetypes.guess_type(req.get_full_url())[0] 1192 if mtype: 1193 headers += "Content-type: %s\n" % mtype 1194 if retrlen is not None and retrlen >= 0: 1195 headers += "Content-length: %d\n" % retrlen 1196 sf = StringIO(headers) 1197 headers = mimetools.Message(sf) 1198 return addinfourl(fp, headers, req.get_full_url()) 1199 except ftplib.all_errors, msg: 1200 raise IOError, ('ftp error', msg), sys.exc_info()[2] 1201 1202 def connect_ftp(self, user, passwd, host, port, dirs): 1203 fw = ftpwrapper(user, passwd, host, port, dirs) 1204 ## fw.ftp.set_debuglevel(1) 1205 return fw 1206 1207 class CacheFTPHandler(FTPHandler): 1208 # XXX would be nice to have pluggable cache strategies 1209 # XXX this stuff is definitely not thread safe 1210 def __init__(self): 1211 self.cache = {} 1212 self.timeout = {} 1213 self.soonest = 0 1214 self.delay = 60 1215 self.max_conns = 16 1216 1217 def setTimeout(self, t): 1218 self.delay = t 1219 1220 def setMaxConns(self, m): 1221 self.max_conns = m 1222 1223 def connect_ftp(self, user, passwd, host, port, dirs): 1224 key = user, host, port, '/'.join(dirs) 1225 if key in self.cache: 1226 self.timeout[key] = time.time() + self.delay 1227 else: 1228 self.cache[key] = ftpwrapper(user, passwd, host, port, dirs) 1229 self.timeout[key] = time.time() + self.delay 1230 self.check_cache() 1231 return self.cache[key] 1232 1233 def check_cache(self): 1234 # first check for old ones 1235 t = time.time() 1236 if self.soonest <= t: 1237 for k, v in self.timeout.items(): 1238 if v < t: 1239 self.cache[k].close() 1240 del self.cache[k] 1241 del self.timeout[k] 1242 self.soonest = min(self.timeout.values()) 1243 1244 # then check the size 1245 if len(self.cache) == self.max_conns: 1246 for k, v in self.timeout.items(): 1247 if v == self.soonest: 1248 del self.cache[k] 1249 del self.timeout[k] 1250 break 1251 self.soonest = min(self.timeout.values()) 1252 1253 class GopherHandler(BaseHandler): 1254 def gopher_open(self, req): 1255 host = req.get_host() 1256 if not host: 1257 raise GopherError('no host given') 1258 host = unquote(host) 1259 selector = req.get_selector() 1260 type, selector = splitgophertype(selector) 1261 selector, query = splitquery(selector) 1262 selector = unquote(selector) 1263 if query: 1264 query = unquote(query) 1265 fp = gopherlib.send_query(selector, query, host) 1266 else: 1267 fp = gopherlib.send_selector(selector, host) 1268 return addinfourl(fp, noheaders(), req.get_full_url()) 1269 1270 #bleck! don't use this yet 1271 class OpenerFactory: 1272 1273 default_handlers = [UnknownHandler, HTTPHandler, 1274 HTTPDefaultErrorHandler, HTTPRedirectHandler, 1275 FTPHandler, FileHandler] 1276 handlers = [] 1277 replacement_handlers = [] 1278 1279 def add_handler(self, h): 1280 self.handlers = self.handlers + [h] 1281 1282 def replace_handler(self, h): 1283 pass 1284 1285 def build_opener(self): 1286 opener = OpenerDirector() 1287 for ph in self.default_handlers: 1288 if inspect.isclass(ph): 1289 ph = ph() 1290 opener.add_handler(ph) 1291
Generated by PyXR 0.9.4