PyXR

c:\python24\lib \ urllib2.py


0001 """An extensible library for opening URLs using a variety of protocols
0002 
0003 The simplest way to use this module is to call the urlopen function,
0004 which accepts a string containing a URL or a Request object (described
0005 below).  It opens the URL and returns the results as a file-like
0006 object; the returned object has some extra methods described below.
0007 
0008 The OpenerDirector manages a collection of Handler objects that do
0009 all the actual work.  Each Handler implements a particular protocol or
0010 option.  The OpenerDirector is a composite object that invokes the
0011 Handlers needed to open the requested URL.  For example, the
0012 HTTPHandler performs HTTP GET and POST requests and deals with
0013 non-error returns.  The HTTPRedirectHandler automatically deals with
0014 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
0015 deals with digest authentication.
0016 
0017 urlopen(url, data=None) -- basic usage is the same as the original
0018 urllib.  pass the url and optionally data to post to an HTTP URL, and
0019 get a file-like object back.  One difference is that you can also pass
0020 a Request instance instead of URL.  Raises a URLError (subclass of
0021 IOError); for HTTP errors, raises an HTTPError, which can also be
0022 treated as a valid response.
0023 
0024 build_opener -- function that creates a new OpenerDirector instance.
0025 will install the default handlers.  accepts one or more Handlers as
0026 arguments, either instances or Handler classes that it will
0027 instantiate.  if one of the arguments is a subclass of the default
0028 handler, the argument will be installed instead of the default.
0029 
0030 install_opener -- installs a new opener as the default opener.
0031 
0032 objects of interest:
0033 OpenerDirector --
0034 
0035 Request -- an object that encapsulates the state of a request.  the
0036 state can be as simple as the URL.  it can also include extra HTTP
0037 headers, e.g. a User-Agent.
0038 
0039 BaseHandler --
0040 
0041 exceptions:
0042 URLError-- a subclass of IOError, individual protocols have their own
0043 specific subclass
0044 
0045 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
0046 as an exceptional event or valid response
0047 
0048 internals:
0049 BaseHandler and parent
0050 _call_chain conventions
0051 
0052 Example usage:
0053 
0054 import urllib2
0055 
0056 # set up authentication info
0057 authinfo = urllib2.HTTPBasicAuthHandler()
0058 authinfo.add_password('realm', 'host', 'username', 'password')
0059 
0060 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
0061 
0062 # build a new opener that adds authentication and caching FTP handlers
0063 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
0064 
0065 # install it
0066 urllib2.install_opener(opener)
0067 
0068 f = urllib2.urlopen('http://www.python.org/')
0069 
0070 
0071 """
0072 
0073 # XXX issues:
0074 # If an authentication error handler tries to perform
0075 # authentication for some reason but fails, how should the error be
0076 # signalled?  The client needs to know the HTTP error code.  But if
0077 # the handler knows that the problem was, e.g., that it didn't know
0078 # that hash algo that was requested in the challenge, it would be good to
0079 # pass that information along to the client, too.
0080 
0081 # XXX to do:
0082 # name!
0083 # documentation (getting there)
0084 # complex proxies
0085 # abstract factory for opener
0086 # ftp errors aren't handled cleanly
0087 # gopher can return a socket.error
0088 # check digest against correct (i.e. non-apache) implementation
0089 
0090 import base64
0091 import ftplib
0092 import gopherlib
0093 import httplib
0094 import inspect
0095 import md5
0096 import mimetypes
0097 import mimetools
0098 import os
0099 import posixpath
0100 import random
0101 import re
0102 import sha
0103 import socket
0104 import sys
0105 import time
0106 import urlparse
0107 import bisect
0108 import cookielib
0109 
0110 try:
0111     from cStringIO import StringIO
0112 except ImportError:
0113     from StringIO import StringIO
0114 
0115 # not sure how many of these need to be gotten rid of
0116 from urllib import (unwrap, unquote, splittype, splithost,
0117      addinfourl, splitport, splitgophertype, splitquery,
0118      splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
0119 
0120 # support for FileHandler, proxies via environment variables
0121 from urllib import localhost, url2pathname, getproxies
0122 
__version__ = "2.4"

# Module-wide default opener.  It is created lazily by urlopen() or set
# explicitly through install_opener().
_opener = None
def urlopen(url, data=None):
    """Open url with the module-wide default opener and return the result.

    url and data are passed unchanged to the opener's open() method.  The
    default opener is created with build_opener() the first time it is
    needed and is reused by later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
0131 
def install_opener(opener):
    """Make opener the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener
0135 
0136 # do these error classes make sense?
0137 # make sure all of the IOError stuff is overridden.  we just want to be
0138 # subtypes.
0139 
class URLError(IOError):
    """Error raised when a URL cannot be opened.

    Only the type is inherited from IOError; none of its implementation
    is shared, so both __init__ and __str__ are overridden.  args is
    filled in for compatibility with other EnvironmentError subclasses,
    but it holds just the reason instead of the usual errno in slot 0
    and strerror in slot 1.  This may be better than nothing.
    """

    def __init__(self, reason):
        self.reason = reason
        self.args = (reason,)

    def __str__(self):
        return '<urlopen error %s>' % self.reason
0152 
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.filename = url
        self.code, self.msg, self.hdrs = code, msg, hdrs
        self.fp = fp
        # addinfourl depends on fp being a valid file object.  An
        # HTTPError may be created without one; in that case the
        # simplest workaround is to leave the base classes
        # uninitialized.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
0172 
class GopherError(URLError):
    """URLError subclass that adds no behaviour of its own.

    NOTE(review): presumably raised by the gopher handler, which is not
    part of this section -- confirm against the rest of the module.
    """
0175 
0176 
class Request:
    """Encapsulates the state of a single URL request.

    A Request holds the (unwrapped) URL, optional data to send, two header
    dictionaries (normal headers and headers that are not to be added to
    a redirected request) and the origin-host / unverifiable values
    passed to the constructor.  The URL is split lazily: get_type() and
    get_host() parse it on first use and cache the pieces.

    Header names are stored in str.capitalize() form ("User-agent").
    has_header() and get_header() compare names exactly as stored, so
    callers must pass names in that form.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        """Create a request for url.

        url -- URL string; a '<URL:...>' wrapper is stripped.
        data -- optional data; when not None get_method() reports "POST".
        headers -- optional mapping of header name to value (None means
            no extra headers).
        origin_req_host -- host of the originating request; derived with
            cookielib.request_host() when omitted.
        unverifiable -- value returned by is_unverifiable().
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # headers defaults to None rather than a shared {} literal so no
        # mutable object lives in the function signature.
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = cookielib.request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # A missing private attribute _Request__r_<name> is filled in by
        # calling get_<name>() and then looked up again.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" when the request carries data, otherwise "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        """Set the data to be sent with the request."""
        self.data = data

    def has_data(self):
        """Return True if data has been set (is not None)."""
        return self.data is not None

    def get_data(self):
        """Return the data set on the request, or None."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL given to the constructor."""
        return self.__original

    def get_type(self):
        """Return the URL scheme, parsing the URL on first use.

        Raises ValueError when the URL has no scheme.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL, parsing on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host (the full URL after set_proxy())."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request through a proxy at host using scheme type.

        The selector becomes the complete original URL.
        """
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        """Return the origin request host stored by the constructor."""
        return self.origin_req_host

    def is_unverifiable(self):
        """Return the unverifiable flag stored by the constructor."""
        return self.unverifiable

    def add_header(self, key, val):
        """Add a header; the name is stored in capitalize() form."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Add a header that is kept apart from the normal headers."""
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if header_name is present in either header dict."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header value; normal headers take precedence."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return (name, value) pairs; normal headers override unredirected."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
0277 
class OpenerDirector:
    """Dispatches requests to the handler objects registered with it.

    Handlers are discovered by method name in add_handler():
      <protocol>_open           -> handle_open[protocol]
      <protocol>_error_<kind>   -> handle_error[protocol][kind]
      <protocol>_request        -> process_request[protocol]
      <protocol>_response       -> process_response[protocol]
    Each table maps to a list of handlers kept in sorted order.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register handler under every table its method names match.

        A handler that matches no table is ignored; otherwise it is also
        added to self.handlers and told about this director through
        handler.add_parent(self).
        """
        registered = False
        for meth in dir(handler):
            sep = meth.find("_")
            protocol, condition = meth[:sep], meth[sep+1:]

            if condition.startswith("error"):
                # The kind is whatever follows "<protocol>_error_";
                # numeric kinds (HTTP status codes) are stored as ints.
                kind_sep = condition.find("_") + sep + 1
                kind = meth[kind_sep+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition == "open":
                kind = protocol
                table = self.handle_open
            elif condition in ("response", "request"):
                kind = protocol
                table = getattr(self, "process_" + condition)
            else:
                continue

            chain = table.setdefault(kind, [])
            if chain:
                bisect.insort(chain, handler)
            else:
                chain.append(handler)
            registered = True

        if registered:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        """Do nothing; only exists for backwards compatibility."""
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in turn.

        Returns the first result that is not None, or None if every
        handler declines (or none is registered for kind).
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or a Request) and return the response.

        Request pre-processors for the protocol run first, then the
        open handlers, then the response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' open chains.

        The first true result wins; the result of the 'unknown' chain is
        returned as-is when the first two produce nothing.
        """
        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        protocol = req.get_type()
        opened = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For 'http' and 'https' the handlers for the specific status code
        (args[2]) are tried first, then the 'default' HTTP error chain.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            chain = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            chain = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(chain, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(chain, 'default',
                                    'http_error_default', *args)
0403 
0404 # XXX probably also want an abstract factory that knows when it makes
0405 # sense to skip a superclass in favor of a subclass and when it might
0406 # make sense to include both
0407 
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    handlers may be handler instances or handler classes; classes are
    instantiated with no arguments.  Returns the populated
    OpenerDirector.
    """

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # A set, not a list: when two supplied handlers both replace the same
    # default class it must be removed only once, otherwise the second
    # default_classes.remove() raises ValueError.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if inspect.isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
0443 
class BaseHandler:
    """Common base for handlers.

    handler_order determines where a handler sorts relative to others
    (lower values sort first) via __lt__.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember parent on self.parent."""
        self.parent = parent

    def close(self):
        """Do nothing; only exists for backwards compatibility."""
        pass

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
0461 
0462 
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return response unchanged for codes 200 and 206; otherwise
        return whatever the parent's error() dispatch produces."""
        code = response.code
        msg = response.msg
        hdrs = response.info()

        if code in (200, 206):
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
0477 
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose http_error_default always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
0481 
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects.

    Redirect loops are detected through a redirect_dict attribute that is
    shared between the original request and every request derived from
    it.  Only redirects to http, https and ftp URLs are followed.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # The new request is created without data, so headers that
            # describe the body of the old request must not be copied.
            newheaders = {}
            for key, value in req.headers.items():
                if key.lower() not in ("content-length", "content-type"):
                    newheaders[key] = value
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the location (or uri) header.

        Returns None when there is no such header or redirect_request()
        declines; raises HTTPError for a disallowed target scheme or
        when a redirect loop is detected; otherwise returns the result
        of opening the new request through the parent.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # The redirect target comes from the server and is untrusted.
        # For security reasons do not follow redirects to protocols
        # other than HTTP, HTTPS or FTP (e.g. file:).
        newurl_lower = newurl.lower()
        if not (newurl_lower.startswith('http://') or
                newurl_lower.startswith('https://') or
                newurl_lower.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed"
                            % newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
0561 
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given as a scheme -> URL mapping.

    For every scheme in the mapping a '<scheme>_open' attribute is
    created on the instance that forwards to proxy_open() with the
    matching proxy URL.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """proxies maps URL scheme to proxy URL; defaults to getproxies()."""
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Default arguments bind the current url/scheme to each lambda.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy (a proxy URL) and decide who opens it.

        Credentials embedded in the proxy URL as user:password@host are
        sent as a Basic Proxy-authorization header.  Returns None when
        the proxy uses the same scheme as the request, so that the other
        handlers carry on; otherwise the request is re-opened through
        the parent.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                credentials = '%s:%s' % (unquote(user), unquote(password))
                # b64encode, unlike encodestring, neither appends a
                # trailing newline nor wraps long input; either would
                # corrupt the header line.
                user_pass = base64.b64encode(credentials)
                req.add_header('Proxy-authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
0598 
0599 # feature suggested by Duncan Booth
0600 # XXX custom is not a good name
class CustomProxy:
    """Pairs a protocol and proxy address with an optional predicate.

    Either pass a function to the constructor or override handle().
    """

    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if func is set and func(req) is true, else None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
0614 
class CustomProxyHandler(BaseHandler):
    """Chooses a proxy per request from registered CustomProxy objects.

    self.proxies maps a protocol name to the list of proxy objects
    registered for it, in registration order.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, *proxies):
        """Register every proxy object passed in (see add_proxy)."""
        self.proxies = {}
        # Previously the arguments were silently discarded.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Re-open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's
        protocol or none of them accepts the request.
        """
        proto = req.get_type()
        for p in self.proxies.get(proto, ()):
            if p.handle(req):
                # Request.set_proxy() takes (host, type); the request's
                # own protocol is passed as the type.
                # NOTE(review): assumes the proxy is spoken to with the
                # same scheme as the request -- confirm.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        """Open req through the parent; p is not used."""
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its proto attribute."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
0642 
class HTTPPasswordMgr:
    """Stores (user, password) pairs keyed by realm and URI.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)},
    where a reduced URI is the pair produced by reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm at uri (one URI or a sequence)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if realm not in self.passwd:
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a plain character-wise common
        # prefix would treat '/foobar' as lying below '/foo' and hand
        # the credentials for one to the other.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
0686 
0687 
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the entries stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, look up realm None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
0696 
0697 
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    Subclasses provide auth_header (the request header to set) and are
    expected to also be handlers, since self.parent is used to retry.
    """

    # Captures the auth scheme and the quoted realm of a challenge.
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        """Use password_mgr for lookups (a new HTTPPasswordMgr if None)."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if headers carry a Basic challenge.

        authreq is the name of the challenge header to read.  Returns
        None when the header is missing, cannot be parsed or names a
        scheme other than Basic.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with a Basic credentials header for realm/host.

        Returns None when no password is known, or when the request
        already carries exactly these credentials (retrying would just
        fail again).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        # b64encode never inserts newlines; encodestring wraps long
        # input, and .strip() only removes the trailing newline.
        auth = 'Basic %s' % base64.b64encode(raw)
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
0733 
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 401 responses using the www-authenticate challenge."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The network-location part of the request URL is the host
        # used for the password lookup.
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
0742 
0743 
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 407 responses using the proxy-authenticate challenge."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # The request's host is used for the password lookup.
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
0752 
0753 
def randombytes(n):
    """Return n random bytes.

    The operating system's randomness source (os.urandom) is used when
    available, which also covers platforms that have no /dev/urandom
    device file.  If it is unavailable the random module is used
    instead; that fallback is NOT cryptographically strong.
    """
    try:
        return os.urandom(n)
    except (AttributeError, NotImplementedError):
        # AttributeError: os.urandom missing (very old Python);
        # NotImplementedError: no randomness source on this platform.
        return "".join([chr(random.randrange(0, 256)) for i in range(n)])
0767 
0768 class AbstractDigestAuthHandler:
0769     # Digest authentication is specified in RFC 2617.
0770 
0771     # XXX The client does not inspect the Authentication-Info header
0772     # in a successful response.
0773 
0774     # XXX It should be possible to test this implementation against
0775     # a mock server that just generates a static set of challenges.
0776 
0777     # XXX qop="auth-int" support is shaky
0778 
0779     def __init__(self, passwd=None):
0780         if passwd is None:
0781             passwd = HTTPPasswordMgr()
0782         self.passwd = passwd
0783         self.add_password = self.passwd.add_password
0784         self.retried = 0
0785         self.nonce_count = 0
0786 
0787     def reset_retry_count(self):
0788         self.retried = 0
0789 
0790     def http_error_auth_reqed(self, auth_header, host, req, headers):
0791         authreq = headers.get(auth_header, None)
0792         if self.retried > 5:
0793             # Don't fail endlessly - if we failed once, we'll probably
0794             # fail a second time. Hm. Unless the Password Manager is
0795             # prompting for the information. Crap. This isn't great
0796             # but it's better than the current 'repeat until recursion
0797             # depth exceeded' approach <wink>
0798             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
0799                             headers, None)
0800         else:
0801             self.retried += 1
0802         if authreq:
0803             scheme = authreq.split()[0]
0804             if scheme.lower() == 'digest':
0805                 return self.retry_http_digest_auth(req, authreq)
0806             else:
0807                 raise ValueError("AbstractDigestAuthHandler doesn't know "
0808                                  "about %s"%(scheme))
0809 
0810     def retry_http_digest_auth(self, req, auth):
0811         token, challenge = auth.split(' ', 1)
0812         chal = parse_keqv_list(parse_http_list(challenge))
0813         auth = self.get_authorization(req, chal)
0814         if auth:
0815             auth_val = 'Digest %s' % auth
0816             if req.headers.get(self.auth_header, None) == auth_val:
0817                 return None
0818             req.add_header(self.auth_header, auth_val)
0819             resp = self.parent.open(req)
0820             return resp
0821 
0822     def get_cnonce(self, nonce):
0823         # The cnonce-value is an opaque
0824         # quoted string value provided by the client and used by both client
0825         # and server to avoid chosen plaintext attacks, to provide mutual
0826         # authentication, and to provide some message integrity protection.
0827         # This isn't a fabulous effort, but it's probably Good Enough.
0828         dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
0829                                        randombytes(8))).hexdigest()
0830         return dig[:16]
0831 
0832     def get_authorization(self, req, chal):
0833         try:
0834             realm = chal['realm']
0835             nonce = chal['nonce']
0836             qop = chal.get('qop')
0837             algorithm = chal.get('algorithm', 'MD5')
0838             # mod_digest doesn't send an opaque, even though it isn't
0839             # supposed to be optional
0840             opaque = chal.get('opaque', None)
0841         except KeyError:
0842             return None
0843 
0844         H, KD = self.get_algorithm_impls(algorithm)
0845         if H is None:
0846             return None
0847 
0848         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
0849         if user is None:
0850             return None
0851 
0852         # XXX not implemented yet
0853         if req.has_data():
0854             entdig = self.get_entity_digest(req.get_data(), chal)
0855         else:
0856             entdig = None
0857 
0858         A1 = "%s:%s:%s" % (user, realm, pw)
0859         A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
0860                         # XXX selector: what about proxies and full urls
0861                         req.get_selector())
0862         if qop == 'auth':
0863             self.nonce_count += 1
0864             ncvalue = '%08x' % self.nonce_count
0865             cnonce = self.get_cnonce(nonce)
0866             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
0867             respdig = KD(H(A1), noncebit)
0868         elif qop is None:
0869             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
0870         else:
0871             # XXX handle auth-int.
0872             pass
0873 
0874         # XXX should the partial digests be encoded too?
0875 
0876         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
0877                'response="%s"' % (user, realm, nonce, req.get_selector(),
0878                                   respdig)
0879         if opaque:
0880             base = base + ', opaque="%s"' % opaque
0881         if entdig:
0882             base = base + ', digest="%s"' % entdig
0883         if algorithm != 'MD5':
0884             base = base + ', algorithm="%s"' % algorithm
0885         if qop:
0886             base = base + ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
0887         return base
0888 
0889     def get_algorithm_impls(self, algorithm):
0890         # lambdas assume digest modules are imported at the top level
0891         if algorithm == 'MD5':
0892             H = lambda x: md5.new(x).hexdigest()
0893         elif algorithm == 'SHA':
0894             H = lambda x: sha.new(x).hexdigest()
0895         # XXX MD5-sess
0896         KD = lambda s, d: H("%s:%s" % (s, d))
0897         return H, KD
0898 
0899     def get_entity_digest(self, data, chal):
0900         # XXX not implemented yet
0901         return None
0902 
0903 
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 response by retrying with digest credentials.

        Returns the result of http_error_auth_reqed() for the
        'www-authenticate' header and lets its exceptions propagate.
        """
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            return self.http_error_auth_reqed('www-authenticate',
                                              host, req, headers)
        finally:
            # Reset even when the attempt raised; otherwise the counter
            # stays above the limit and later requests fail at once.
            self.reset_retry_count()
0919 
0920 
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (407 responses)."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 response by retrying with digest credentials.

        Returns the result of http_error_auth_reqed() for the
        'proxy-authenticate' header and lets its exceptions propagate.
        """
        host = req.get_host()
        try:
            return self.http_error_auth_reqed('proxy-authenticate',
                                              host, req, headers)
        finally:
            # Reset even when the attempt raised; otherwise the counter
            # stays above the limit and later requests fail at once.
            self.reset_retry_count()
0931 
0932 class AbstractHTTPHandler(BaseHandler):
0933 
0934     def __init__(self, debuglevel=0):
0935         self._debuglevel = debuglevel
0936 
0937     def set_http_debuglevel(self, level):
0938         self._debuglevel = level
0939 
0940     def do_request_(self, request):
0941         host = request.get_host()
0942         if not host:
0943             raise URLError('no host given')
0944 
0945         if request.has_data():  # POST
0946             data = request.get_data()
0947             if not request.has_header('Content-type'):
0948                 request.add_unredirected_header(
0949                     'Content-type',
0950                     'application/x-www-form-urlencoded')
0951             if not request.has_header('Content-length'):
0952                 request.add_unredirected_header(
0953                     'Content-length', '%d' % len(data))
0954 
0955         scheme, sel = splittype(request.get_selector())
0956         sel_host, sel_path = splithost(sel)
0957         if not request.has_header('Host'):
0958             request.add_unredirected_header('Host', sel_host or host)
0959         for name, value in self.parent.addheaders:
0960             name = name.capitalize()
0961             if not request.has_header(name):
0962                 request.add_unredirected_header(name, value)
0963 
0964         return request
0965 
0966     def do_open(self, http_class, req):
0967         """Return an addinfourl object for the request, using http_class.
0968 
0969         http_class must implement the HTTPConnection API from httplib.
0970         The addinfourl return value is a file-like object.  It also
0971         has methods and attributes including:
0972             - info(): return a mimetools.Message object for the headers
0973             - geturl(): return the original request URL
0974             - code: HTTP status code
0975         """
0976         host = req.get_host()
0977         if not host:
0978             raise URLError('no host given')
0979 
0980         h = http_class(host) # will parse host:port
0981         h.set_debuglevel(self._debuglevel)
0982 
0983         headers = dict(req.headers)
0984         headers.update(req.unredirected_hdrs)
0985         # We want to make an HTTP/1.1 request, but the addinfourl
0986         # class isn't prepared to deal with a persistent connection.
0987         # It will try to read all remaining data from the socket,
0988         # which will block while the server waits for the next request.
0989         # So make sure the connection gets closed after the (only)
0990         # request.
0991         headers["Connection"] = "close"
0992         try:
0993             h.request(req.get_method(), req.get_selector(), req.data, headers)
0994             r = h.getresponse()
0995         except socket.error, err: # XXX what error?
0996             raise URLError(err)
0997 
0998         # Pick apart the HTTPResponse object to get the addinfourl
0999         # object initialized properly.
1000 
1001         # Wrap the HTTPResponse object in socket's file object adapter
1002         # for Windows.  That adapter calls recv(), so delegate recv()
1003         # to read().  This weird wrapping allows the returned object to
1004         # have readline() and readlines() methods.
1005 
1006         # XXX It might be better to extract the read buffering code
1007         # out of socket._fileobject() and into a base class.
1008 
1009         r.recv = r.read
1010         fp = socket._fileobject(r)
1011 
1012         resp = addinfourl(fp, r.msg, req.get_full_url())
1013         resp.code = r.status
1014         resp.msg = r.reason
1015         return resp
1016 
1017 
class HTTPHandler(AbstractHTTPHandler):
    """Handler for http: URLs."""

    # request preparation comes straight from the abstract base
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req through do_open() using httplib.HTTPConnection."""
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)
1024 
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for https: URLs; defined only if httplib has HTTPS."""

        # request preparation comes straight from the abstract base
        https_request = AbstractHTTPHandler.do_request_

        def https_open(self, req):
            """Open req through do_open() using httplib.HTTPSConnection."""
            connection_class = httplib.HTTPSConnection
            return self.do_open(connection_class, req)
1032 
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new cookielib.CookieJar when none is given."""
        if cookiejar is not None:
            self.cookiejar = cookiejar
        else:
            self.cookiejar = cookielib.CookieJar()

    def http_request(self, request):
        """Let the jar add its cookie header to request; return request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from response; return response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed exactly like http traffic
    https_response = http_response
    https_request = http_request
1049 
class UnknownHandler(BaseHandler):
    """Handler that rejects requests whose URL type is unknown."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's URL type."""
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
1054 
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dictionary mapping each key to its value, with one pair
    of surrounding double quotes removed from the value.  Empty
    elements are skipped and an empty value ("key=") is accepted.
    Raises ValueError for a non-empty element that has no '='.
    """
    parsed = {}
    for elt in l:
        if not elt:
            # null list element (e.g. produced by a doubled comma)
            continue
        k, v = elt.split('=', 1)
        # slices instead of indexing so an empty value cannot raise
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1064 
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns a list of the elements with surrounding whitespace
    stripped; the quotes themselves are kept.  A trailing comma does
    not produce a final empty element.  Raises ValueError when a
    quoted-string is left open and a comma follows the opening quote,
    because the element boundary is then ambiguous.
    """
    # XXX this function could probably use more testing

    items = []
    start = 0        # index at which the current element begins
    quote_at = -1    # index of the quote that is still open, or -1
    for pos, ch in enumerate(s):
        if ch == '"':
            if quote_at < 0:
                quote_at = pos
            else:
                quote_at = -1
        elif ch == ',' and quote_at < 0:
            # a comma outside quotes ends the current element
            items.append(s[start:pos])
            start = pos + 1
    if quote_at >= 0 and s.find(',', quote_at) != -1:
        raise ValueError("unbalanced quotes")
    if start < len(s):
        items.append(s[start:])
    return [item.strip() for item in items]
1110 
class FileHandler(BaseHandler):
    """Open file: URLs from the local file system."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as ftp.

        A selector of the form //host/... (two slashes, not three) is
        re-opened through the parent opener with the request type
        changed to 'ftp'; everything else goes to open_local_file().
        """
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses regarded as the local host.

        The tuple is computed once and cached on the class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # The machine's own host name may not resolve; fall
                # back to localhost alone rather than fail every call.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The headers carry Content-type (guessed from the name,
        text/plain by default), Content-length and Last-modified.
        Raises URLError when the URL names a host that is not this
        machine or carries a port; errors from os.stat() and open()
        propagate unchanged.
        """
        import email.Utils
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(selector)[0]
        headers = mimetools.Message(StringIO(
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if host:
            host, port = splitport(host)
        # port is only bound when a host was given; the "not host"
        # test short-circuits before it is read otherwise.
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+selector)
        raise URLError('file not on local host')
1149 
1150 class FTPHandler(BaseHandler):
1151     def ftp_open(self, req):
1152         host = req.get_host()
1153         if not host:
1154             raise IOError, ('ftp error', 'no host given')
1155         host, port = splitport(host)
1156         if port is None:
1157             port = ftplib.FTP_PORT
1158         else:
1159             port = int(port)
1160 
1161         # username/password handling
1162         user, host = splituser(host)
1163         if user:
1164             user, passwd = splitpasswd(user)
1165         else:
1166             passwd = None
1167         host = unquote(host)
1168         user = unquote(user or '')
1169         passwd = unquote(passwd or '')
1170 
1171         try:
1172             host = socket.gethostbyname(host)
1173         except socket.error, msg:
1174             raise URLError(msg)
1175         path, attrs = splitattr(req.get_selector())
1176         dirs = path.split('/')
1177         dirs = map(unquote, dirs)
1178         dirs, file = dirs[:-1], dirs[-1]
1179         if dirs and not dirs[0]:
1180             dirs = dirs[1:]
1181         try:
1182             fw = self.connect_ftp(user, passwd, host, port, dirs)
1183             type = file and 'I' or 'D'
1184             for attr in attrs:
1185                 attr, value = splitvalue(attr)
1186                 if attr.lower() == 'type' and \
1187                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
1188                     type = value.upper()
1189             fp, retrlen = fw.retrfile(file, type)
1190             headers = ""
1191             mtype = mimetypes.guess_type(req.get_full_url())[0]
1192             if mtype:
1193                 headers += "Content-type: %s\n" % mtype
1194             if retrlen is not None and retrlen >= 0:
1195                 headers += "Content-length: %d\n" % retrlen
1196             sf = StringIO(headers)
1197             headers = mimetools.Message(sf)
1198             return addinfourl(fp, headers, req.get_full_url())
1199         except ftplib.all_errors, msg:
1200             raise IOError, ('ftp error', msg), sys.exc_info()[2]
1201 
1202     def connect_ftp(self, user, passwd, host, port, dirs):
1203         fw = ftpwrapper(user, passwd, host, port, dirs)
1204 ##        fw.ftp.set_debuglevel(1)
1205         return fw
1206 
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections for reuse."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> time after which the entry expires
        self.soonest = 0     # smallest value in self.timeout (0 if empty)
        self.delay = 60      # added to time.time() to get an expiry time
        self.max_conns = 16  # cache size that triggers an eviction

    def setTimeout(self, t):
        """Set the delay added to the current time for each expiry."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the ftpwrapper for this login and directory.

        A cached wrapper is reused (and its expiry refreshed) when one
        exists; otherwise a new one is created and cached.
        """
        key = user, host, port, '/'.join(dirs)
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        # Hold the wrapper before pruning: with a very small max_conns
        # or a non-positive delay check_cache() can drop this entry.
        # NOTE(review): in the non-positive-delay case the wrapper has
        # been close()d before it is returned - confirm that is wanted.
        fw = self.cache[key]
        self.check_cache()
        return fw

    def check_cache(self):
        """Close expired connections and enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self._update_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self._update_soonest()

    def _update_soonest(self):
        # Recompute the earliest expiry; min() of an empty sequence
        # raises ValueError, so an empty table maps back to 0.
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0
1252 
class GopherHandler(BaseHandler):
    """Open gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) and wrap the reply.

        Raises GopherError when the request has no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        gtype, rest = splitgophertype(req.get_selector())
        rest, query = splitquery(rest)
        selector = unquote(rest)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        return addinfourl(fp, noheaders(), req.get_full_url())
1269 
1270 #bleck! don't use this yet
class OpenerFactory:
    """Experimental builder of OpenerDirector objects."""

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    handlers = []
    replacement_handlers = []

    def add_handler(self, h):
        """Register an extra handler (class or instance) for build_opener()."""
        # builds a new list so the class-level default is never mutated
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Not implemented; does nothing."""
        pass

    def build_opener(self):
        """Create and return an OpenerDirector.

        The default handlers are installed first, followed by those
        registered through add_handler(); handler classes are
        instantiated with no arguments.
        """
        opener = OpenerDirector()
        for ph in self.default_handlers + self.handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        return opener
1291
Generated by PyXR 0.9.4