PyXR

c:\python24\lib \ cookielib.py



0001 """HTTP cookie handling for web clients.
0002 
0003 This module has (now fairly distant) origins in Gisle Aas' Perl module
0004 HTTP::Cookies, from the libwww-perl library.
0005 
0006 Docstrings, comments and debug strings in this code refer to the
0007 attributes of the HTTP cookie system as cookie-attributes, to distinguish
0008 them clearly from Python attributes.
0009 
0010 Class diagram (note that the classes which do not derive from
0011 FileCookieJar are not distributed with the Python standard library, but
0012 are available from http://wwwsearch.sf.net/):
0013 
0014                         CookieJar____
0015                         /     \      \
0016             FileCookieJar      \      \
0017              /    |   \         \      \
0018  MozillaCookieJar | LWPCookieJar \      \
0019                   |               |      \
0020                   |   ---MSIEBase |       \
0021                   |  /      |     |        \
0022                   | /   MSIEDBCookieJar BSDDBCookieJar
0023                   |/
0024                MSIECookieJar
0025 
0026 """
0027 
0028 import sys, re, urlparse, copy, time, urllib, logging
0029 from types import StringTypes
0030 try:
0031     import threading as _threading
0032 except ImportError:
0033     import dummy_threading as _threading
0034 import httplib  # only for the default HTTP port
0035 from calendar import timegm
0036 
# Module-wide debug hook: forwards to the "cookielib" logger at DEBUG level.
debug = logging.getLogger("cookielib").debug

# Default HTTP port taken from httplib, stored as a string rather than an int.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Shared error text for operations that require a filename when none was
# passed in and none was given to the CookieJar instance.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
0042 
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be masked.

    Intended to be called from inside a catch-all ``except:`` clause.
    KeyboardInterrupt, SystemExit, MemoryError and any exception types given
    in the tuple `unmasked` are re-raised unchanged.  Any other exception is
    swallowed and reported through warnings.warn, with its traceback included
    in the warning text.

    """
    never_masked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    exc_type = sys.exc_info()[0]
    if issubclass(exc_type, never_masked):
        raise
    # The exception is being swallowed: surface it as a warning instead.
    import warnings, traceback, StringIO
    buf = StringIO.StringIO()
    traceback.print_exc(None, buf)
    warnings.warn("cookielib bug!\n%s" % buf.getvalue(), stacklevel=2)
0057 
0058 
0059 # Date/time conversion
0060 # -----------------------------------------------------------------------------
0061 
0062 EPOCH_YEAR = 1970
0063 def _timegm(tt):
0064     year, month, mday, hour, min, sec = tt[:6]
0065     if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
0066         (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
0067         return timegm(tt)
0068     else:
0069         return None
0070 
# Abbreviated weekday names, Monday first.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated month names, January first.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, for case-insensitive month-name lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
0076 
def time2isoz(t=None):
    """Format t (seconds since the epoch) as "YYYY-MM-DD hh:mm:ssZ".

    The current time is used when t is omitted or None.  The result is
    expressed in Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # First six gmtime fields: year, month, day, hour, minute, second.
    fields = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
0093 
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is the Netscape cookie date format,
    expressed in GMT:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    # The weekday abbreviation is followed by a comma, as shown in the
    # format above.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
0109 
0110 
# Timezone names that are equivalent to UTC; only the keys are used.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric offset: optional sign, 1-2 digit hours, optional ':' and minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by the timezone string tz.

    Names listed in UTC_ZONES give 0.  Numeric offsets such as "-0800",
    "+01:00" or "5" are converted to seconds.  None is returned for anything
    else.

    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
0127 
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    The arguments are the pieces captured by the date regexps.  `mon` may be
    a month name (matched case-insensitively against MONTHS_LOWER) or a month
    number.  `hr`, `min` and `sec` may be None, in which case they default to
    0.  `sec` may carry a fractional part (as the ISO 8601 regexp allows);
    the fraction is discarded.  `tz` is a timezone string or None, and None
    is treated as UTC.

    Returns None if the month is not recognised, the fields are out of the
    range accepted by _timegm, or the timezone string is not understood.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # Seconds may look like "29.5"; int() alone would raise ValueError on
    # that, so go through float() and drop the fraction.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year closest
        # to the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
0180 
# Exact RFC 1123 style date, e.g. "Wed, 09 Feb 1994 22:23:32 GMT".
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Leading weekday name (abbreviated or full) with optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not
    recognized.  If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The token has the shape of a month abbreviation but is not
            # one (e.g. "Foo"): report it as an unrecognized date rather
            # than letting the ValueError escape.
            return None
        # All captured fields are plain digit strings, so int() is exact and
        # keeps the result an integer, as documented.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
0258 
ISO_DATE_RE = re.compile(
    """^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    Like http2time, but for strings in the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is ignored.  None is returned when the text does not
    match ISO_DATE_RE.

    """
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX the last group is an extra bit of the timezone that is ignored
    #   here: is this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
0303 
0304 
0305 # Header parsing
0306 # -----------------------------------------------------------------------------
0307 
def unmatched(match):
    """Return the searched string with the matched span cut out."""
    begin, finish = match.span(0)
    text = match.string
    return text[:begin] + text[finish:]
0312 
# A token: leading whitespace, then a run of characters that are not
# "=", whitespace, ";" or ",".
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
# "=" followed by a double-quoted value; backslash escapes are allowed
# inside the quotes.
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# "=" followed by an unquoted value (possibly empty).
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash escape inside a quoted value; group 1 is the escaped character.
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    header_values must be a sequence of strings, not a single string (this
    is checked with an assert).

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # Guard against being handed a bare string instead of a list of strings.
    assert type(header_values) not in StringTypes
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        # Each pass removes one recognised piece from text (via unmatched)
        # until nothing is left.
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # drop the backslash from each escaped character
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                # If nothing was stripped the loop would never terminate.
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
0401 
# Characters that must be backslash-escaped inside a quoted value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build a single header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within one
    list are joined with "; " and the lists themselves with ", ".  A pair
    whose value is None contributes only its key.  Values that are not made
    up purely of word characters are escaped and wrapped in double quotes.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        parts = []
        for key, value in pairs:
            if value is None:
                parts.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \ before quoting
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            parts.append("%s=%s" % (key, value))
        if parts:
            headers.append("; ".join(parts))
    return ", ".join(headers)
0427 
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers is a sequence of header value strings.  The result is a list
    with one list of (key, value) pairs per non-empty header.  Names of known
    cookie-attributes are lowercased; a parameter without "=" gets the value
    None; an "expires" value is converted with http2time (None if it cannot
    be parsed, or if no value was given); and ("version", "0") is appended
    when the header carries no "version" parameter.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for param in re.split(r";\s*", ns_header):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                # valueless attribute (e.g. "secure") or cookie with
                # missing value
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            lc = k.lower()
            if lc in known_attrs:
                k = lc
            if k == "version":
                # This is an RFC 2109 cookie.  Will be treated as RFC 2965
                # cookie in rest of code.
                # Probably it should be parsed with split_header_words, but
                # that's too much hassle.
                version_set = True
            if k == "expires" and v is not None:
                # convert expires date to seconds since epoch; a bare
                # "expires" with no value is left as None instead of
                # crashing on None.startswith
                if v.startswith('"'): v = v[1:]
                if v.endswith('"'): v = v[:-1]
                v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
0486 
0487 
# Matches text ending in a dot followed by digits (the tail of an IPv4
# address).
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        return False
    # a host domain name neither starts nor ends with a dot
    return not (text[0] == "." or text[-1] == ".")
0503 
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B is a proper suffix of A.  Finding B
    # somewhere inside A is not enough: "x.y.com.evil" must not match
    # ".y.com".  A != B is already known, so a suffix match implies that N
    # is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
0542 
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    Used when accepting/blocking domains: anything that does not match
    IPV4_RE counts as a host domain name.

    """
    return not IPV4_RE.search(text)
0552 
def user_domain_match(A, B):
    """Domain comparison used for blocking/accepting domains.

    A and B may be host domain names or IP addresses; the comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with B.
    Otherwise (and whenever either argument is not a liberal host domain
    name) the two must be equal.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # at least one is an IP address: only exact equality matches
    return A == B
0572 
# A trailing ":<digits>" port suffix.
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.  Any port suffix is
    removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
0589 
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the request-host
    contains no dot and does not match IPV4_RE; otherwise it equals the
    request-host.

    """
    req_host = request_host(request)
    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
0600 
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path (with any ";parameters" re-attached and then passed
    through escape_path), query and fragment of the request's full URL.  The
    result always starts with "/".

    """
    pieces = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = pieces[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    escaped = escape_path(path)
    req_path = urlparse.urlunparse(("", "", escaped, "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
0615 
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host().
    DEFAULT_HTTP_PORT is returned when there is no ":", and None (after a
    debug message) when the port is not numeric.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
0629 
# Characters, besides A-Z, a-z, 0-9, '_', '.', and '-', that may appear
# unescaped in a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: percent sign followed by two hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %-escape with its hex digits uppercased."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # The character encoding behind existing %-escapes in a URL is unknown,
    # but one has to be chosen in order to escape invalid path characters.
    # UTF-8 is used, as recommended in the HTML 4.0 specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
0651 
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        return h
    # h is A.B, where A is everything before the first dot
    b = h[first_dot+1:]
    b_has_dot = b.find(".") >= 0
    if is_HDN(h) and (b_has_dot or b == "local"):
        return "." + b
    return h
0686 
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
0702 
0703 
class Cookie:
    """HTTP Cookie.

    Instances represent both Netscape and RFC 2965 cookies.

    The class is deliberately simple: it only holds attributes, and it is
    possible to build instances that do not comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest):
        """Store the cookie-attributes.

        version and expires are converted to int unless they are None, and
        domain is lowercased.  rest is a mapping of nonstandard
        cookie-attributes; a shallow copy of it is kept.  Raises ValueError
        if port is None while port_specified is True.

        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Whether the domain given in the cookie-attribute had an initial
        # dot has to be remembered in order to follow RFC 2965 (as clarified
        # in draft errata).  Needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if a nonstandard cookie-attribute `name` is stored."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return nonstandard cookie-attribute `name`, or `default`."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Store nonstandard cookie-attribute `name` with `value`."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        `now` defaults to the current time.  Cookies whose expires is None
        never count as expired.

        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        if self.expires <= now:
            return True
        return False

    def __str__(self):
        if self.port is None:
            port_part = ""
        else:
            port_part = ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        names = ["version", "name", "value",
                 "port", "port_specified",
                 "domain", "domain_specified", "domain_initial_dot",
                 "path", "path_specified",
                 "secure", "expires", "discard", "comment", "comment_url",
                 ]
        shown = ["%s=%s" % (attr, repr(getattr(self, attr)))
                 for attr in names]
        shown.append("rest=%s" % repr(self._rest))
        return "Cookie(%s)" % ", ".join(shown)
0796 
0797 
class CookiePolicy:
    """Interface deciding which cookies are accepted and returned.

    set_ok and return_ok must be provided by subclasses; the domain and path
    pre-checks default to allowing everything.

    A policy may also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Not implemented here: raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Not implemented here: raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.

        """
        return True
0829 
0830 
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    set_ok() and return_ok() each run a fixed series of small checks
    (set_ok_version, return_ok_secure, ...) that are looked up by name on
    the instance, so a subclass can tighten one rule by overriding just that
    method.

    """

    # Bit flags that may be or-ed together for the strict_ns_domain
    # constructor argument; they make the handling of Netscape (version 0)
    # cookie domains stricter.
    #  DomainStrictNoDots: when setting, apply the "host prefix must not
    #   contain a dot" rule to Netscape cookies too
    #  DomainStrictNonDomain: when returning, a cookie that did not specify
    #   a domain is only returned to a host that string-compares equal
    #  DomainRFC2965Match: when setting, require a full domain-match
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains: sequence of domains never accepted or returned
         (None means no domain is blocked)
        allowed_domains: sequence of the only domains accepted or returned,
         or None to place no such restriction
        netscape: handle Netscape (version 0) cookies
        rfc2965: handle RFC 2965 (version > 0) cookies
        hide_cookie2: stored on the instance; this class does not read it
        strict_domain: refuse domains that look like a country-code second
         level domain, such as .co.uk
        strict_rfc2965_unverifiable, strict_ns_unverifiable: refuse
         third-party cookies of the respective kind during an unverifiable
         transaction
        strict_ns_domain: or-ed Domain* flags (see the class attributes)
        strict_ns_set_initial_dollar: refuse Netscape cookies whose name
         starts with '$'
        strict_ns_set_path: refuse Netscape cookies whose path attribute is
         not a prefix of the request path

        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        # Unlike the block-list, the allow-list keeps None as a distinct
        # value meaning "no allow-list at all".
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches an entry of the block-list."""
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches no entry.

        With no allow-list (None) every domain is allowed, so the result is
        False.

        """
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # The first failing check decides; later checks are not run.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Refuse cookies with no version or a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            debug("   Set-Cookie2 without version attribute (%s=%s)",
                  cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies in unverifiable transactions.

        Only applies when the matching strict_*_unverifiable option is on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                debug("   third-party RFC 2965 cookie during "
                             "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                debug("   third-party Netscape cookie during "
                             "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Refuse '$'-prefixed Netscape cookie names, if so configured."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Refuse an explicit path that is not a prefix of the request path.

        Always enforced for version > 0 cookies; for Netscape cookies only
        when strict_ns_set_path is on.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                debug("   path attribute %s is not a prefix of request "
                      "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Apply the block/allow lists and the domain-matching rules.

        The matching rules are only applied when the cookie carried an
        explicit domain cookie-attribute.

        """
        if self.is_blocked(cookie.domain):
            debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if (sld.lower() in [
                        "co", "ac",
                        "com", "edu", "org", "net", "gov", "mil", "int"] and
                        len(tld) == 2):
                        # domain like .co.uk
                        debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                debug("   non-local domain %s contains no embedded dot",
                      domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    debug("   effective request-host %s (even with added "
                          "initial dot) does not end with %s",
                          erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    debug("   effective request-host %s does not domain-match "
                          "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What is left of the request host once the cookie domain
                # has been cut off its end must be a single label.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    debug("   host prefix %s for domain %s contains a dot",
                          host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Require the request port to be in an explicit port list.

        A port list containing a non-numeric entry is refused outright.

        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                debug("   request port (%s) not found in %s",
                      req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        # The first failing check decides; later checks are not run.
        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Do not return cookies whose protocol is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Do not return third-party cookies in unverifiable transactions.

        Only applies when the matching strict_*_unverifiable option is on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                debug("   third-party RFC 2965 cookie during unverifiable "
                      "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                debug("   third-party Netscape cookie during unverifiable "
                      "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Only return secure cookies on https requests."""
        if cookie.secure and request.get_type() != "https":
            debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Do not return cookies that have expired.

        Reads self._now, which this class never sets itself: the CookieJar
        assigns it on the policy before asking for decisions.

        """
        if cookie.is_expired(self._now):
            debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Require the request port to be in the cookie's port list, if any."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                debug("   request port %s does not match cookie port %s",
                      req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Require the request host to match the cookie domain."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # A cookie that did not specify a domain is stored under the bare
        # host name.  Put a dot in front before doing a suffix comparison,
        # so that a cookie for "example.com" is not returned to
        # "evilexample.com".
        if domain and not domain.startswith("."):
            dotted_domain = "."+domain
        else:
            dotted_domain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug("   cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            debug("   effective request-host name %s does not domain-match "
                  "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
            debug("   request-host %s does not match Netscape cookie domain "
                  "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return false if no cookie stored under domain may be returned.

        Checks that the request host ends with domain, then consults the
        block-list and the allow-list.

        """
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        # Give both host names and the cookie domain a leading dot.  The
        # names are now always bound (a host that already started with a dot
        # used to leave them undefined), and the suffix test can only
        # succeed on a label boundary.
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        if domain and not domain.startswith("."):
            dotted_domain = "."+domain
        else:
            dotted_domain = domain
        if not (req_host.endswith(dotted_domain) or
                erhn.endswith(dotted_domain)):
            #debug("   request domain %s does not match cookie domain %s",
            #      req_host, domain)
            return False

        if self.is_blocked(domain):
            debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return false if no cookie stored under path may be returned.

        The request path matches when it equals the cookie path, or when the
        cookie path is a prefix of it that ends on a "/" boundary.

        """
        debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        if req_path == path:
            return True
        # A bare prefix test would let "/foo" match "/foobar"; require the
        # prefix to end with "/" or to be followed by one.
        pathlen = len(path)
        if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True
        debug("  %s does not path-match %s", req_path, path)
        return False
1162 
1163 
def vals_sorted_by_key(adict):
    """Return a list of the values of adict, ordered by ascending key.

    adict must provide keys() and get().  sorted() is used so that keys()
    may return any iterable, not only a list that can be sorted in place.

    """
    lookup = adict.get
    return [lookup(key) for key in sorted(adict.keys())]
1168 
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first, by sorted key.

    Any value that has an ``items`` attribute is treated as a nested mapping
    and descended into; every other value is yielded as it is.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False

        if is_leaf:
            yield item
        else:
            for leaf in deepvalues(item):
                yield leaf
1184 
1185 
# Sentinel used as the second parameter to the dict.get() method, to
# distinguish an absent dict key from one that is present with a None value.
# The class object itself is the sentinel: callers pass Absent (not an
# instance of it) as the default and test the result with "is Absent".
class Absent: pass
1189 
1190 class CookieJar:
1191     """Collection of HTTP cookies.
1192 
1193     You may not need to know about this class: try
1194     urllib2.build_opener(HTTPCookieProcessor).open(url).
1195 
1196     """
1197 
    # Matches one character that is not a word character (\W).
    non_word_re = re.compile(r"\W")
    # Matches, and captures, one double quote or one backslash.
    quote_re = re.compile(r"([\"\\])")
    # Matches an optional dot followed by a (possibly empty) run of
    # characters other than dots.
    strict_domain_re = re.compile(r"\.?[^.]*")
    # Matches a (possibly empty) run of characters other than dots.
    domain_re = re.compile(r"[^.]*")
    # Matches one or more dots at the very start of the string.
    dots_re = re.compile(r"^\.+")

    # NOTE: unlike the attributes above this is a plain pattern string, not
    # a compiled regular expression.  It matches "#LWP-Cookies-" at the start
    # of the string and captures the "<digits>.<digits>" that follows.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1205 
1206     def __init__(self, policy=None):
1207         if policy is None:
1208             policy = DefaultCookiePolicy()
1209         self._policy = policy
1210 
1211         self._cookies_lock = _threading.RLock()
1212         self._cookies = {}
1213 
    def set_policy(self, policy):
        """Replace the policy object consulted by this jar with policy."""
        self._policy = policy
1216 
1217     def _cookies_for_domain(self, domain, request):
1218         cookies = []
1219         if not self._policy.domain_return_ok(domain, request):
1220             return []
1221         debug("Checking %s for cookies to return", domain)
1222         cookies_by_path = self._cookies[domain]
1223         for path in cookies_by_path.keys():
1224             if not self._policy.path_return_ok(path, request):
1225                 continue
1226             cookies_by_name = cookies_by_path[path]
1227             for cookie in cookies_by_name.values():
1228                 if not self._policy.return_ok(cookie, request):
1229                     debug("   not returning cookie")
1230                     continue
1231                 debug("   it's a match")
1232                 cookies.append(cookie)
1233         return cookies
1234 
1235     def _cookies_for_request(self, request):
1236         """Return a list of cookies to be returned to server."""
1237         cookies = []
1238         for domain in self._cookies.keys():
1239             cookies.extend(self._cookies_for_domain(domain, request))
1240         return cookies
1241 
1242     def _cookie_attrs(self, cookies):
1243         """Return a list of cookie-attributes to be returned to server.
1244 
1245         like ['foo="bar"; $Path="/"', ...]
1246 
1247         The $Version attribute is also added when appropriate (currently only
1248         once per request).
1249 
1250         """
1251         # add cookies in order of most specific (ie. longest) path first
1252         def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
1253         cookies.sort(decreasing_size)
1254 
1255         version_set = False
1256 
1257         attrs = []
1258         for cookie in cookies:
1259             # set version of Cookie header
1260             # XXX
1261             # What should it be if multiple matching Set-Cookie headers have
1262             #  different versions themselves?
1263             # Answer: there is no answer; was supposed to be settled by
1264             #  RFC 2965 errata, but that may never appear...
1265             version = cookie.version
1266             if not version_set:
1267                 version_set = True
1268                 if version > 0:
1269                     attrs.append("$Version=%s" % version)
1270 
1271             # quote cookie value if necessary
1272             # (not for Netscape protocol, which already has any quotes
1273             #  intact, due to the poorly-specified Netscape Cookie: syntax)
1274             if ((cookie.value is not None) and
1275                 self.non_word_re.search(cookie.value) and version > 0):
1276                 value = self.quote_re.sub(r"\\\1", cookie.value)
1277             else:
1278                 value = cookie.value
1279 
1280             # add cookie-attributes to be returned in Cookie header
1281             if cookie.value is None:
1282                 attrs.append(cookie.name)
1283             else:
1284                 attrs.append("%s=%s" % (cookie.name, value))
1285             if version > 0:
1286                 if cookie.path_specified:
1287                     attrs.append('$Path="%s"' % cookie.path)
1288                 if cookie.domain.startswith("."):
1289                     domain = cookie.domain
1290                     if (not cookie.domain_initial_dot and
1291                         domain.startswith(".")):
1292                         domain = domain[1:]
1293                     attrs.append('$Domain="%s"' % domain)
1294                 if cookie.port is not None:
1295                     p = "$Port"
1296                     if cookie.port_specified:
1297                         p = p + ('="%s"' % cookie.port)
1298                     attrs.append(p)
1299 
1300         return attrs
1301 
1302     def add_cookie_header(self, request):
1303         """Add correct Cookie: header to request (urllib2.Request object).
1304 
1305         The Cookie2 header is also added unless policy.hide_cookie2 is true.
1306 
1307         """
1308         debug("add_cookie_header")
1309         self._cookies_lock.acquire()
1310 
1311         self._policy._now = self._now = int(time.time())
1312 
1313         req_host, erhn = eff_request_host(request)
1314         strict_non_domain = (
1315             self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain)
1316 
1317         cookies = self._cookies_for_request(request)
1318 
1319         attrs = self._cookie_attrs(cookies)
1320         if attrs:
1321             if not request.has_header("Cookie"):
1322                 request.add_unredirected_header(
1323                     "Cookie", "; ".join(attrs))
1324 
1325         # if necessary, advertise that we know RFC 2965
1326         if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1327             not request.has_header("Cookie2")):
1328             for cookie in cookies:
1329                 if cookie.version != 1:
1330                     request.add_unredirected_header("Cookie2", '$Version="1"')
1331                     break
1332 
1333         self._cookies_lock.release()
1334 
1335         self.clear_expired_cookies()
1336 
1337     def _normalized_cookie_tuples(self, attrs_set):
1338         """Return list of tuples containing normalised cookie information.
1339 
1340         attrs_set is the list of lists of key,value pairs extracted from
1341         the Set-Cookie or Set-Cookie2 headers.
1342 
1343         Tuples are name, value, standard, rest, where name and value are the
1344         cookie name and value, standard is a dictionary containing the standard
1345         cookie-attributes (discard, secure, version, expires or max-age,
1346         domain, path and port) and rest is a dictionary containing the rest of
1347         the cookie-attributes.
1348 
1349         """
1350         cookie_tuples = []
1351 
1352         boolean_attrs = "discard", "secure"
1353         value_attrs = ("version",
1354                        "expires", "max-age",
1355                        "domain", "path", "port",
1356                        "comment", "commenturl")
1357 
1358         for cookie_attrs in attrs_set:
1359             name, value = cookie_attrs[0]
1360 
1361             # Build dictionary of standard cookie-attributes (standard) and
1362             # dictionary of other cookie-attributes (rest).
1363 
1364             # Note: expiry time is normalised to seconds since epoch.  V0
1365             # cookies should have the Expires cookie-attribute, and V1 cookies
1366             # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1367             # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1368             # accept either (but prefer Max-Age).
1369             max_age_set = False
1370 
1371             bad_cookie = False
1372 
1373             standard = {}
1374             rest = {}
1375             for k, v in cookie_attrs[1:]:
1376                 lc = k.lower()
1377                 # don't lose case distinction for unknown fields
1378                 if lc in value_attrs or lc in boolean_attrs:
1379                     k = lc
1380                 if k in boolean_attrs and v is None:
1381                     # boolean cookie-attribute is present, but has no value
1382                     # (like "discard", rather than "port=80")
1383                     v = True
1384                 if k in standard:
1385                     # only first value is significant
1386                     continue
1387                 if k == "domain":
1388                     if v is None:
1389                         debug("   missing value for domain attribute")
1390                         bad_cookie = True
1391                         break
1392                     # RFC 2965 section 3.3.3
1393                     v = v.lower()
1394                 if k == "expires":
1395                     if max_age_set:
1396                         # Prefer max-age to expires (like Mozilla)
1397                         continue
1398                     if v is None:
1399                         debug("   missing or invalid value for expires "
1400                               "attribute: treating as session cookie")
1401                         continue
1402                 if k == "max-age":
1403                     max_age_set = True
1404                     try:
1405                         v = int(v)
1406                     except ValueError:
1407                         debug("   missing or invalid (non-numeric) value for "
1408                               "max-age attribute")
1409                         bad_cookie = True
1410                         break
1411                     # convert RFC 2965 Max-Age to seconds since epoch
1412                     # XXX Strictly you're supposed to follow RFC 2616
1413                     #   age-calculation rules.  Remember that zero Max-Age is a
1414                     #   is a request to discard (old and new) cookie, though.
1415                     k = "expires"
1416                     v = self._now + v
1417                 if (k in value_attrs) or (k in boolean_attrs):
1418                     if (v is None and
1419                         k not in ["port", "comment", "commenturl"]):
1420                         debug("   missing value for %s attribute" % k)
1421                         bad_cookie = True
1422                         break
1423                     standard[k] = v
1424                 else:
1425                     rest[k] = v
1426 
1427             if bad_cookie:
1428                 continue
1429 
1430             cookie_tuples.append((name, value, standard, rest))
1431 
1432         return cookie_tuples
1433 
1434     def _cookie_from_cookie_tuple(self, tup, request):
1435         # standard is dict of standard cookie-attributes, rest is dict of the
1436         # rest of them
1437         name, value, standard, rest = tup
1438 
1439         domain = standard.get("domain", Absent)
1440         path = standard.get("path", Absent)
1441         port = standard.get("port", Absent)
1442         expires = standard.get("expires", Absent)
1443 
1444         # set the easy defaults
1445         version = standard.get("version", None)
1446         if version is not None: version = int(version)
1447         secure = standard.get("secure", False)
1448         # (discard is also set if expires is Absent)
1449         discard = standard.get("discard", False)
1450         comment = standard.get("comment", None)
1451         comment_url = standard.get("commenturl", None)
1452 
1453         # set default path
1454         if path is not Absent and path != "":
1455             path_specified = True
1456             path = escape_path(path)
1457         else:
1458             path_specified = False
1459             path = request_path(request)
1460             i = path.rfind("/")
1461             if i != -1:
1462                 if version == 0:
1463                     # Netscape spec parts company from reality here
1464                     path = path[:i]
1465                 else:
1466                     path = path[:i+1]
1467             if len(path) == 0: path = "/"
1468 
1469         # set default domain
1470         domain_specified = domain is not Absent
1471         # but first we have to remember whether it starts with a dot
1472         domain_initial_dot = False
1473         if domain_specified:
1474             domain_initial_dot = bool(domain.startswith("."))
1475         if domain is Absent:
1476             req_host, erhn = eff_request_host(request)
1477             domain = erhn
1478         elif not domain.startswith("."):
1479             domain = "."+domain
1480 
1481         # set default port
1482         port_specified = False
1483         if port is not Absent:
1484             if port is None:
1485                 # Port attr present, but has no value: default to request port.
1486                 # Cookie should then only be sent back on that port.
1487                 port = request_port(request)
1488             else:
1489                 port_specified = True
1490                 port = re.sub(r"\s+", "", port)
1491         else:
1492             # No port attr present.  Cookie can be sent back on any port.
1493             port = None
1494 
1495         # set default expires and discard
1496         if expires is Absent:
1497             expires = None
1498             discard = True
1499         elif expires <= self._now:
1500             # Expiry date in past is request to delete cookie.  This can't be
1501             # in DefaultCookiePolicy, because can't delete cookies there.
1502             try:
1503                 self.clear(domain, path, name)
1504             except KeyError:
1505                 pass
1506             debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1507                   domain, path, name)
1508             return None
1509 
1510         return Cookie(version,
1511                       name, value,
1512                       port, port_specified,
1513                       domain, domain_specified, domain_initial_dot,
1514                       path, path_specified,
1515                       secure,
1516                       expires,
1517                       discard,
1518                       comment,
1519                       comment_url,
1520                       rest)
1521 
1522     def _cookies_from_attrs_set(self, attrs_set, request):
1523         cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1524 
1525         cookies = []
1526         for tup in cookie_tuples:
1527             cookie = self._cookie_from_cookie_tuple(tup, request)
1528             if cookie: cookies.append(cookie)
1529         return cookies
1530 
1531     def make_cookies(self, response, request):
1532         """Return sequence of Cookie objects extracted from response object."""
1533         # get cookie-attributes for RFC 2965 and Netscape protocols
1534         headers = response.info()
1535         rfc2965_hdrs = headers.getheaders("Set-Cookie2")
1536         ns_hdrs = headers.getheaders("Set-Cookie")
1537 
1538         rfc2965 = self._policy.rfc2965
1539         netscape = self._policy.netscape
1540 
1541         if ((not rfc2965_hdrs and not ns_hdrs) or
1542             (not ns_hdrs and not rfc2965) or
1543             (not rfc2965_hdrs and not netscape) or
1544             (not netscape and not rfc2965)):
1545             return []  # no relevant cookie headers: quick exit
1546 
1547         try:
1548             cookies = self._cookies_from_attrs_set(
1549                 split_header_words(rfc2965_hdrs), request)
1550         except:
1551             reraise_unmasked_exceptions()
1552             cookies = []
1553 
1554         if ns_hdrs and netscape:
1555             try:
1556                 ns_cookies = self._cookies_from_attrs_set(
1557                     parse_ns_headers(ns_hdrs), request)
1558             except:
1559                 reraise_unmasked_exceptions()
1560                 ns_cookies = []
1561 
1562             # Look for Netscape cookies (from Set-Cookie headers) that match
1563             # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1564             # For each match, keep the RFC 2965 cookie and ignore the Netscape
1565             # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1566             # bundled in with the Netscape cookies for this purpose, which is
1567             # reasonable behaviour.
1568             if rfc2965:
1569                 lookup = {}
1570                 for cookie in cookies:
1571                     lookup[(cookie.domain, cookie.path, cookie.name)] = None
1572 
1573                 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1574                     key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1575                     return key not in lookup
1576                 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1577 
1578             if ns_cookies:
1579                 cookies.extend(ns_cookies)
1580 
1581         return cookies
1582 
1583     def set_cookie_if_ok(self, cookie, request):
1584         """Set a cookie if policy says it's OK to do so."""
1585         self._cookies_lock.acquire()
1586         self._policy._now = self._now = int(time.time())
1587 
1588         if self._policy.set_ok(cookie, request):
1589             self.set_cookie(cookie)
1590 
1591         self._cookies_lock.release()
1592 
1593     def set_cookie(self, cookie):
1594         """Set a cookie, without checking whether or not it should be set."""
1595         c = self._cookies
1596         self._cookies_lock.acquire()
1597         try:
1598             if cookie.domain not in c: c[cookie.domain] = {}
1599             c2 = c[cookie.domain]
1600             if cookie.path not in c2: c2[cookie.path] = {}
1601             c3 = c2[cookie.path]
1602             c3[cookie.name] = cookie
1603         finally:
1604             self._cookies_lock.release()
1605 
1606     def extract_cookies(self, response, request):
1607         """Extract cookies from response, where allowable given the request."""
1608         debug("extract_cookies: %s", response.info())
1609         self._cookies_lock.acquire()
1610         self._policy._now = self._now = int(time.time())
1611 
1612         for cookie in self.make_cookies(response, request):
1613             if self._policy.set_ok(cookie, request):
1614                 debug(" setting cookie: %s", cookie)
1615                 self.set_cookie(cookie)
1616         self._cookies_lock.release()
1617 
1618     def clear(self, domain=None, path=None, name=None):
1619         """Clear some cookies.
1620 
1621         Invoking this method without arguments will clear all cookies.  If
1622         given a single argument, only cookies belonging to that domain will be
1623         removed.  If given two arguments, cookies belonging to the specified
1624         path within that domain are removed.  If given three arguments, then
1625         the cookie with the specified name, path and domain is removed.
1626 
1627         Raises KeyError if no matching cookie exists.
1628 
1629         """
1630         if name is not None:
1631             if (domain is None) or (path is None):
1632                 raise ValueError(
1633                     "domain and path must be given to remove a cookie by name")
1634             del self._cookies[domain][path][name]
1635         elif path is not None:
1636             if domain is None:
1637                 raise ValueError(
1638                     "domain must be given to remove cookies by path")
1639             del self._cookies[domain][path]
1640         elif domain is not None:
1641             del self._cookies[domain]
1642         else:
1643             self._cookies = {}
1644 
1645     def clear_session_cookies(self):
1646         """Discard all session cookies.
1647 
1648         Note that the .save() method won't save session cookies anyway, unless
1649         you ask otherwise by passing a true ignore_discard argument.
1650 
1651         """
1652         self._cookies_lock.acquire()
1653         for cookie in self:
1654             if cookie.discard:
1655                 self.clear(cookie.domain, cookie.path, cookie.name)
1656         self._cookies_lock.release()
1657 
1658     def clear_expired_cookies(self):
1659         """Discard all expired cookies.
1660 
1661         You probably don't need to call this method: expired cookies are never
1662         sent back to the server (provided you're using DefaultCookiePolicy),
1663         this method is called by CookieJar itself every so often, and the
1664         .save() method won't save expired cookies anyway (unless you ask
1665         otherwise by passing a true ignore_expires argument).
1666 
1667         """
1668         self._cookies_lock.acquire()
1669         now = time.time()
1670         for cookie in self:
1671             if cookie.is_expired(now):
1672                 self.clear(cookie.domain, cookie.path, cookie.name)
1673         self._cookies_lock.release()
1674 
1675     def __iter__(self):
1676         return deepvalues(self._cookies)
1677 
1678     def __len__(self):
1679         """Return number of contained cookies."""
1680         i = 0
1681         for cookie in self: i = i + 1
1682         return i
1683 
1684     def __repr__(self):
1685         r = []
1686         for cookie in self: r.append(repr(cookie))
1687         return "<%s[%s]>" % (self.__class__, ", ".join(r))
1688 
1689     def __str__(self):
1690         r = []
1691         for cookie in self: r.append(str(cookie))
1692         return "<%s[%s]>" % (self.__class__, ", ".join(r))
1693 
1694 
class LoadError(Exception):
    """Signals that cookies could not be loaded from a file."""
1696 
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class supplies the file handling around loading (load(), revert())
    but not a file format: save() raises NotImplementedError, and load()
    hands the open file to self._really_load(), which is not defined in
    this class and so has to come from a subclass.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, if not None, must be string-like (ValueError is raised
        otherwise); it becomes the default file for load() and revert().
        delayload is stored as a bool; policy is passed on to CookieJar.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a
            # string is accepted; any failure at all means "not string-like".
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        If filename is None, self.filename is used; ValueError is raised if
        that is None too.  The file is opened, passed to
        self._really_load() together with the filename and the two ignore_*
        flags, and closed again whether or not loading succeeds.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        If filename is None, self.filename is used; ValueError is raised if
        that is None too.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            # Keep a private copy so the jar can be put back exactly as it
            # was if loading fails.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
        finally:
            # Always release: the except clause above re-raises, and without
            # this finally a failed load would leave the jar locked for good.
            self._cookies_lock.release()
1754 
1755 from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1756 from _MozillaCookieJar import MozillaCookieJar
1757 

Generated by PyXR 0.9.4
SourceForge.net Logo