0001 """HTTP cookie handling for web clients. 0002 0003 This module has (now fairly distant) origins in Gisle Aas' Perl module 0004 HTTP::Cookies, from the libwww-perl library. 0005 0006 Docstrings, comments and debug strings in this code refer to the 0007 attributes of the HTTP cookie system as cookie-attributes, to distinguish 0008 them clearly from Python attributes. 0009 0010 Class diagram (note that the classes which do not derive from 0011 FileCookieJar are not distributed with the Python standard library, but 0012 are available from http://wwwsearch.sf.net/): 0013 0014 CookieJar____ 0015 / \ \ 0016 FileCookieJar \ \ 0017 / | \ \ \ 0018 MozillaCookieJar | LWPCookieJar \ \ 0019 | | \ 0020 | ---MSIEBase | \ 0021 | / | | \ 0022 | / MSIEDBCookieJar BSDDBCookieJar 0023 |/ 0024 MSIECookieJar 0025 0026 """ 0027 0028 import sys, re, urlparse, copy, time, urllib, logging 0029 from types import StringTypes 0030 try: 0031 import threading as _threading 0032 except ImportError: 0033 import dummy_threading as _threading 0034 import httplib # only for the default HTTP port 0035 from calendar import timegm 0036 0037 debug = logging.getLogger("cookielib").debug 0038 0039 DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT) 0040 MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 0041 "instance initialised with one)") 0042 0043 def reraise_unmasked_exceptions(unmasked=()): 0044 # There are a few catch-all except: statements in this module, for 0045 # catching input that's bad in unexpected ways. 0046 # This function re-raises some exceptions we don't want to trap. 0047 unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) 0048 etype = sys.exc_info()[0] 0049 if issubclass(etype, unmasked): 0050 raise 0051 # swallowed an exception 0052 import warnings, traceback, StringIO 0053 f = StringIO.StringIO() 0054 traceback.print_exc(None, f) 0055 msg = f.getvalue() 0056 warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2) 0057 0058 0059 # Date/time conversion 0060 # ----------------------------------------------------------------------------- 0061 0062 EPOCH_YEAR = 1970 0063 def _timegm(tt): 0064 year, month, mday, hour, min, sec = tt[:6] 0065 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 0066 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 0067 return timegm(tt) 0068 else: 0069 return None 0070 0071 DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 0072 MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 0073 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 0074 MONTHS_LOWER = [] 0075 for month in MONTHS: MONTHS_LOWER.append(month.lower()) 0076 0077 def time2isoz(t=None): 0078 """Return a string representing time in seconds since epoch, t. 0079 0080 If the function is called without an argument, it will use the current 0081 time. 0082 0083 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 0084 representing Universal Time (UTC, aka GMT). An example of this format is: 0085 0086 1994-11-24 08:49:37Z 0087 0088 """ 0089 if t is None: t = time.time() 0090 year, mon, mday, hour, min, sec = time.gmtime(t)[:6] 0091 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 0092 year, mon, mday, hour, min, sec) 0093 0094 def time2netscape(t=None): 0095 """Return a string representing time in seconds since epoch, t. 0096 0097 If the function is called without an argument, it will use the current 0098 time. 0099 0100 The format of the returned string is like this: 0101 0102 Wed, DD-Mon-YYYY HH:MM:SS GMT 0103 0104 """ 0105 if t is None: t = time.time() 0106 year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] 0107 return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( 0108 DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec) 0109 0110 0111 UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 0112 0113 TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") 0114 def offset_from_tz_string(tz): 0115 offset = None 0116 if tz in UTC_ZONES: 0117 offset = 0 0118 else: 0119 m = TIMEZONE_RE.search(tz) 0120 if m: 0121 offset = 3600 * int(m.group(2)) 0122 if m.group(3): 0123 offset = offset + 60 * int(m.group(3)) 0124 if m.group(1) == '-': 0125 offset = -offset 0126 return offset 0127 0128 def _str2time(day, mon, yr, hr, min, sec, tz): 0129 # translate month name to number 0130 # month numbers start with 1 (January) 0131 try: 0132 mon = MONTHS_LOWER.index(mon.lower())+1 0133 except ValueError: 0134 # maybe it's already a number 0135 try: 0136 imon = int(mon) 0137 except ValueError: 0138 return None 0139 if 1 <= imon <= 12: 0140 mon = imon 0141 else: 0142 return None 0143 0144 # make sure clock elements are defined 0145 if hr is None: hr = 0 0146 if min is None: min = 0 0147 if sec is None: sec = 0 0148 0149 yr = int(yr) 0150 day = int(day) 0151 hr = int(hr) 0152 min = int(min) 0153 sec = int(sec) 0154 0155 if yr < 1000: 0156 # find "obvious" year 0157 cur_yr = time.localtime(time.time())[0] 0158 m = cur_yr % 100 0159 tmp = yr 0160 yr = yr + cur_yr - m 0161 m = m - tmp 0162 if abs(m) > 50: 0163 if m > 0: yr = yr + 100 0164 else: yr = yr - 100 0165 0166 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 0167 t = _timegm((yr, mon, day, hr, min, sec, tz)) 0168 0169 if t is not None: 0170 # adjust time using timezone string, to get absolute time since epoch 0171 if tz is None: 0172 tz = "UTC" 0173 tz = tz.upper() 0174 offset = offset_from_tz_string(tz) 0175 if offset is None: 0176 return None 0177 t = t - offset 0178 0179 return t 0180 0181 STRICT_DATE_RE = re.compile( 0182 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " 0183 "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") 0184 WEEKDAY_RE = re.compile( 0185 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) 0186 LOOSE_HTTP_DATE_RE = re.compile( 0187 r"""^ 0188 (\d\d?) # day 0189 (?:\s+|[-\/]) 0190 (\w+) # month 0191 (?:\s+|[-\/]) 0192 (\d+) # year 0193 (?: 0194 (?:\s+|:) # separator before clock 0195 (\d\d?):(\d\d) # hour:min 0196 (?::(\d\d))? # optional seconds 0197 )? # optional clock 0198 \s* 0199 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone 0200 \s* 0201 (?:\(\w+\))? # ASCII representation of timezone in parens. 0202 \s*$""", re.X) 0203 def http2time(text): 0204 """Returns time in seconds since epoch of time represented by a string. 0205 0206 Return value is an integer. 0207 0208 None is returned if the format of str is unrecognized, the time is outside 0209 the representable range, or the timezone string is not recognized. If the 0210 string contains no timezone, UTC is assumed. 0211 0212 The timezone in the string may be numerical (like "-0800" or "+0100") or a 0213 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the 0214 timezone strings equivalent to UTC (zero offset) are known to the function. 0215 0216 The function loosely parses the following formats: 0217 0218 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format 0219 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format 0220 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format 0221 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) 0222 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) 0223 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) 0224 0225 The parser ignores leading and trailing whitespace. The time may be 0226 absent. 0227 0228 If the year is given with only 2 digits, the function will select the 0229 century that makes the year closest to the current date. 0230 0231 """ 0232 # fast exit for strictly conforming string 0233 m = STRICT_DATE_RE.search(text) 0234 if m: 0235 g = m.groups() 0236 mon = MONTHS_LOWER.index(g[1].lower()) + 1 0237 tt = (int(g[2]), mon, int(g[0]), 0238 int(g[3]), int(g[4]), float(g[5])) 0239 return _timegm(tt) 0240 0241 # No, we need some messy parsing... 0242 0243 # clean up 0244 text = text.lstrip() 0245 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday 0246 0247 # tz is time zone specifier string 0248 day, mon, yr, hr, min, sec, tz = [None]*7 0249 0250 # loose regexp parse 0251 m = LOOSE_HTTP_DATE_RE.search(text) 0252 if m is not None: 0253 day, mon, yr, hr, min, sec, tz = m.groups() 0254 else: 0255 return None # bad format 0256 0257 return _str2time(day, mon, yr, hr, min, sec, tz) 0258 0259 ISO_DATE_RE = re.compile( 0260 """^ 0261 (\d{4}) # year 0262 [-\/]? 0263 (\d\d?) # numerical month 0264 [-\/]? 0265 (\d\d?) # day 0266 (?: 0267 (?:\s+|[-:Tt]) # separator before clock 0268 (\d\d?):?(\d\d) # hour:min 0269 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) 0270 )? # optional clock 0271 \s* 0272 ([-+]?\d\d?:?(:?\d\d)? 0273 |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) 0274 \s*$""", re.X) 0275 def iso2time(text): 0276 """ 0277 As for http2time, but parses the ISO 8601 formats: 0278 0279 1994-02-03 14:15:29 -0100 -- ISO 8601 format 0280 1994-02-03 14:15:29 -- zone is optional 0281 1994-02-03 -- only date 0282 1994-02-03T14:15:29 -- Use T as separator 0283 19940203T141529Z -- ISO 8601 compact format 0284 19940203 -- only date 0285 0286 """ 0287 # clean up 0288 text = text.lstrip() 0289 0290 # tz is time zone specifier string 0291 day, mon, yr, hr, min, sec, tz = [None]*7 0292 0293 # loose regexp parse 0294 m = ISO_DATE_RE.search(text) 0295 if m is not None: 0296 # XXX there's an extra bit of the timezone I'm ignoring here: is 0297 # this the right thing to do? 0298 yr, mon, day, hr, min, sec, tz, _ = m.groups() 0299 else: 0300 return None # bad format 0301 0302 return _str2time(day, mon, yr, hr, min, sec, tz) 0303 0304 0305 # Header parsing 0306 # ----------------------------------------------------------------------------- 0307 0308 def unmatched(match): 0309 """Return unmatched part of re.Match object.""" 0310 start, end = match.span(0) 0311 return match.string[:start]+match.string[end:] 0312 0313 HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") 0314 HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") 0315 HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") 0316 HEADER_ESCAPE_RE = re.compile(r"\\(.)") 0317 def split_header_words(header_values): 0318 r"""Parse header values into a list of lists containing key,value pairs. 0319 0320 The function knows how to deal with ",", ";" and "=" as well as quoted 0321 values after "=". A list of space separated tokens are parsed as if they 0322 were separated by ";". 0323 0324 If the header_values passed as argument contains multiple values, then they 0325 are treated as if they were a single value separated by comma ",". 0326 0327 This means that this function is useful for parsing header fields that 0328 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax 0329 the requirement for tokens). 0330 0331 headers = #header 0332 header = (token | parameter) *( [";"] (token | parameter)) 0333 0334 token = 1*<any CHAR except CTLs or separators> 0335 separators = "(" | ")" | "<" | ">" | "@" 0336 | "," | ";" | ":" | "\" | <"> 0337 | "/" | "[" | "]" | "?" | "=" 0338 | "{" | "}" | SP | HT 0339 0340 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) 0341 qdtext = <any TEXT except <">> 0342 quoted-pair = "\" CHAR 0343 0344 parameter = attribute "=" value 0345 attribute = token 0346 value = token | quoted-string 0347 0348 Each header is represented by a list of key/value pairs. The value for a 0349 simple token (not part of a parameter) is None. Syntactically incorrect 0350 headers will not necessarily be parsed as you would want. 0351 0352 This is easier to describe with some examples: 0353 0354 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) 0355 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] 0356 >>> split_header_words(['text/html; charset="iso-8859-1"']) 0357 [[('text/html', None), ('charset', 'iso-8859-1')]] 0358 >>> split_header_words([r'Basic realm="\"foo\bar\""']) 0359 [[('Basic', None), ('realm', '"foobar"')]] 0360 0361 """ 0362 assert type(header_values) not in StringTypes 0363 result = [] 0364 for text in header_values: 0365 orig_text = text 0366 pairs = [] 0367 while text: 0368 m = HEADER_TOKEN_RE.search(text) 0369 if m: 0370 text = unmatched(m) 0371 name = m.group(1) 0372 m = HEADER_QUOTED_VALUE_RE.search(text) 0373 if m: # quoted value 0374 text = unmatched(m) 0375 value = m.group(1) 0376 value = HEADER_ESCAPE_RE.sub(r"\1", value) 0377 else: 0378 m = HEADER_VALUE_RE.search(text) 0379 if m: # unquoted value 0380 text = unmatched(m) 0381 value = m.group(1) 0382 value = value.rstrip() 0383 else: 0384 # no value, a lone token 0385 value = None 0386 pairs.append((name, value)) 0387 elif text.lstrip().startswith(","): 0388 # concatenated headers, as per RFC 2616 section 4.2 0389 text = text.lstrip()[1:] 0390 if pairs: result.append(pairs) 0391 pairs = [] 0392 else: 0393 # skip junk 0394 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) 0395 assert nr_junk_chars > 0, ( 0396 "split_header_words bug: '%s', '%s', %s" % 0397 (orig_text, text, pairs)) 0398 text = non_junk 0399 if pairs: result.append(pairs) 0400 return result 0401 0402 HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") 0403 def join_header_words(lists): 0404 """Do the inverse (almost) of the conversion done by split_header_words. 0405 0406 Takes a list of lists of (key, value) pairs and produces a single header 0407 value. Attribute values are quoted if needed. 0408 0409 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]]) 0410 'text/plain; charset="iso-8859/1"' 0411 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]]) 0412 'text/plain, charset="iso-8859/1"' 0413 0414 """ 0415 headers = [] 0416 for pairs in lists: 0417 attr = [] 0418 for k, v in pairs: 0419 if v is not None: 0420 if not re.search(r"^\w+$", v): 0421 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ 0422 v = '"%s"' % v 0423 k = "%s=%s" % (k, v) 0424 attr.append(k) 0425 if attr: headers.append("; ".join(attr)) 0426 return ", ".join(headers) 0427 0428 def parse_ns_headers(ns_headers): 0429 """Ad-hoc parser for Netscape protocol cookie-attributes. 0430 0431 The old Netscape cookie format for Set-Cookie can for instance contain 0432 an unquoted "," in the expires field, so we have to use this ad-hoc 0433 parser instead of split_header_words. 0434 0435 XXX This may not make the best possible effort to parse all the crap 0436 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient 0437 parser is probably better, so could do worse than following that if 0438 this ever gives any trouble. 0439 0440 Currently, this is also used for parsing RFC 2109 cookies. 0441 0442 """ 0443 known_attrs = ("expires", "domain", "path", "secure", 0444 # RFC 2109 attrs (may turn up in Netscape cookies, too) 0445 "port", "max-age") 0446 0447 result = [] 0448 for ns_header in ns_headers: 0449 pairs = [] 0450 version_set = False 0451 for param in re.split(r";\s*", ns_header): 0452 param = param.rstrip() 0453 if param == "": continue 0454 if "=" not in param: 0455 if param.lower() in known_attrs: 0456 k, v = param, None 0457 else: 0458 # cookie with missing value 0459 k, v = param, None 0460 else: 0461 k, v = re.split(r"\s*=\s*", param, 1) 0462 k = k.lstrip() 0463 if k is not None: 0464 lc = k.lower() 0465 if lc in known_attrs: 0466 k = lc 0467 if k == "version": 0468 # This is an RFC 2109 cookie. Will be treated as RFC 2965 0469 # cookie in rest of code. 0470 # Probably it should be parsed with split_header_words, but 0471 # that's too much hassle. 0472 version_set = True 0473 if k == "expires": 0474 # convert expires date to seconds since epoch 0475 if v.startswith('"'): v = v[1:] 0476 if v.endswith('"'): v = v[:-1] 0477 v = http2time(v) # None if invalid 0478 pairs.append((k, v)) 0479 0480 if pairs: 0481 if not version_set: 0482 pairs.append(("version", "0")) 0483 result.append(pairs) 0484 0485 return result 0486 0487 0488 IPV4_RE = re.compile(r"\.\d+$") 0489 def is_HDN(text): 0490 """Return True if text is a host domain name.""" 0491 # XXX 0492 # This may well be wrong. Which RFC is HDN defined in, if any (for 0493 # the purposes of RFC 2965)? 0494 # For the current implementation, what about IPv6? Remember to look 0495 # at other uses of IPV4_RE also, if change this. 0496 if IPV4_RE.search(text): 0497 return False 0498 if text == "": 0499 return False 0500 if text[0] == "." or text[-1] == ".": 0501 return False 0502 return True 0503 0504 def domain_match(A, B): 0505 """Return True if domain A domain-matches domain B, according to RFC 2965. 0506 0507 A and B may be host domain names or IP addresses. 0508 0509 RFC 2965, section 1: 0510 0511 Host names can be specified either as an IP address or a HDN string. 0512 Sometimes we compare one host name with another. (Such comparisons SHALL 0513 be case-insensitive.) Host A's name domain-matches host B's if 0514 0515 * their host name strings string-compare equal; or 0516 0517 * A is a HDN string and has the form NB, where N is a non-empty 0518 name string, B has the form .B', and B' is a HDN string. (So, 0519 x.y.com domain-matches .Y.com but not Y.com.) 0520 0521 Note that domain-match is not a commutative operation: a.b.c.com 0522 domain-matches .c.com, but not the reverse. 0523 0524 """ 0525 # Note that, if A or B are IP addresses, the only relevant part of the 0526 # definition of the domain-match algorithm is the direct string-compare. 0527 A = A.lower() 0528 B = B.lower() 0529 if A == B: 0530 return True 0531 if not is_HDN(A): 0532 return False 0533 i = A.rfind(B) 0534 if i == -1 or i == 0: 0535 # A does not have form NB, or N is the empty string 0536 return False 0537 if not B.startswith("."): 0538 return False 0539 if not is_HDN(B[1:]): 0540 return False 0541 return True 0542 0543 def liberal_is_HDN(text): 0544 """Return True if text is a sort-of-like a host domain name. 0545 0546 For accepting/blocking domains. 0547 0548 """ 0549 if IPV4_RE.search(text): 0550 return False 0551 return True 0552 0553 def user_domain_match(A, B): 0554 """For blocking/accepting domains. 0555 0556 A and B may be host domain names or IP addresses. 0557 0558 """ 0559 A = A.lower() 0560 B = B.lower() 0561 if not (liberal_is_HDN(A) and liberal_is_HDN(B)): 0562 if A == B: 0563 # equal IP addresses 0564 return True 0565 return False 0566 initial_dot = B.startswith(".") 0567 if initial_dot and A.endswith(B): 0568 return True 0569 if not initial_dot and A == B: 0570 return True 0571 return False 0572 0573 cut_port_re = re.compile(r":\d+$") 0574 def request_host(request): 0575 """Return request-host, as defined by RFC 2965. 0576 0577 Variation from RFC: returned value is lowercased, for convenient 0578 comparison. 0579 0580 """ 0581 url = request.get_full_url() 0582 host = urlparse.urlparse(url)[1] 0583 if host == "": 0584 host = request.get_header("Host", "") 0585 0586 # remove port, if present 0587 host = cut_port_re.sub("", host, 1) 0588 return host.lower() 0589 0590 def eff_request_host(request): 0591 """Return a tuple (request-host, effective request-host name). 0592 0593 As defined by RFC 2965, except both are lowercased. 0594 0595 """ 0596 erhn = req_host = request_host(request) 0597 if req_host.find(".") == -1 and not IPV4_RE.search(req_host): 0598 erhn = req_host + ".local" 0599 return req_host, erhn 0600 0601 def request_path(request): 0602 """request-URI, as defined by RFC 2965.""" 0603 url = request.get_full_url() 0604 #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url) 0605 #req_path = escape_path("".join(urlparse.urlparse(url)[2:])) 0606 path, parameters, query, frag = urlparse.urlparse(url)[2:] 0607 if parameters: 0608 path = "%s;%s" % (path, parameters) 0609 path = escape_path(path) 0610 req_path = urlparse.urlunparse(("", "", path, "", query, frag)) 0611 if not req_path.startswith("/"): 0612 # fix bad RFC 2396 absoluteURI 0613 req_path = "/"+req_path 0614 return req_path 0615 0616 def request_port(request): 0617 host = request.get_host() 0618 i = host.find(':') 0619 if i >= 0: 0620 port = host[i+1:] 0621 try: 0622 int(port) 0623 except ValueError: 0624 debug("nonnumeric port: '%s'", port) 0625 return None 0626 else: 0627 port = DEFAULT_HTTP_PORT 0628 return port 0629 0630 # Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't 0631 # need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 0632 HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 0633 ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 0634 def uppercase_escaped_char(match): 0635 return "%%%s" % match.group(1).upper() 0636 def escape_path(path): 0637 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 0638 # There's no knowing what character encoding was used to create URLs 0639 # containing %-escapes, but since we have to pick one to escape invalid 0640 # path characters, we pick UTF-8, as recommended in the HTML 4.0 0641 # specification: 0642 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 0643 # And here, kind of: draft-fielding-uri-rfc2396bis-03 0644 # (And in draft IRI specification: draft-duerst-iri-05) 0645 # (And here, for new URI schemes: RFC 2718) 0646 if isinstance(path, unicode): 0647 path = path.encode("utf-8") 0648 path = urllib.quote(path, HTTP_PATH_SAFE) 0649 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 0650 return path 0651 0652 def reach(h): 0653 """Return reach of host h, as defined by RFC 2965, section 1. 0654 0655 The reach R of a host name H is defined as follows: 0656 0657 * If 0658 0659 - H is the host domain name of a host; and, 0660 0661 - H has the form A.B; and 0662 0663 - A has no embedded (that is, interior) dots; and 0664 0665 - B has at least one embedded dot, or B is the string "local". 0666 then the reach of H is .B. 0667 0668 * Otherwise, the reach of H is H. 0669 0670 >>> reach("www.acme.com") 0671 '.acme.com' 0672 >>> reach("acme.com") 0673 'acme.com' 0674 >>> reach("acme.local") 0675 '.local' 0676 0677 """ 0678 i = h.find(".") 0679 if i >= 0: 0680 #a = h[:i] # this line is only here to show what a is 0681 b = h[i+1:] 0682 i = b.find(".") 0683 if is_HDN(h) and (i >= 0 or b == "local"): 0684 return "."+b 0685 return h 0686 0687 def is_third_party(request): 0688 """ 0689 0690 RFC 2965, section 3.3.6: 0691 0692 An unverifiable transaction is to a third-party host if its request- 0693 host U does not domain-match the reach R of the request-host O in the 0694 origin transaction. 0695 0696 """ 0697 req_host = request_host(request) 0698 if not domain_match(req_host, reach(request.get_origin_req_host())): 0699 return True 0700 else: 0701 return False 0702 0703 0704 class Cookie: 0705 """HTTP Cookie. 0706 0707 This class represents both Netscape and RFC 2965 cookies. 0708 0709 This is deliberately a very simple class. It just holds attributes. It's 0710 possible to construct Cookie instances that don't comply with the cookie 0711 standards. CookieJar.make_cookies is the factory function for Cookie 0712 objects -- it deals with cookie parsing, supplying defaults, and 0713 normalising to the representation used in this class. CookiePolicy is 0714 responsible for checking them to see whether they should be accepted from 0715 and returned to the server. 0716 0717 Note that the port may be present in the headers, but unspecified ("Port" 0718 rather than"Port=80", for example); if this is the case, port is None. 0719 0720 """ 0721 0722 def __init__(self, version, name, value, 0723 port, port_specified, 0724 domain, domain_specified, domain_initial_dot, 0725 path, path_specified, 0726 secure, 0727 expires, 0728 discard, 0729 comment, 0730 comment_url, 0731 rest): 0732 0733 if version is not None: version = int(version) 0734 if expires is not None: expires = int(expires) 0735 if port is None and port_specified is True: 0736 raise ValueError("if port is None, port_specified must be false") 0737 0738 self.version = version 0739 self.name = name 0740 self.value = value 0741 self.port = port 0742 self.port_specified = port_specified 0743 # normalise case, as per RFC 2965 section 3.3.3 0744 self.domain = domain.lower() 0745 self.domain_specified = domain_specified 0746 # Sigh. We need to know whether the domain given in the 0747 # cookie-attribute had an initial dot, in order to follow RFC 2965 0748 # (as clarified in draft errata). Needed for the returned $Domain 0749 # value. 0750 self.domain_initial_dot = domain_initial_dot 0751 self.path = path 0752 self.path_specified = path_specified 0753 self.secure = secure 0754 self.expires = expires 0755 self.discard = discard 0756 self.comment = comment 0757 self.comment_url = comment_url 0758 0759 self._rest = copy.copy(rest) 0760 0761 def has_nonstandard_attr(self, name): 0762 return name in self._rest 0763 def get_nonstandard_attr(self, name, default=None): 0764 return self._rest.get(name, default) 0765 def set_nonstandard_attr(self, name, value): 0766 self._rest[name] = value 0767 0768 def is_expired(self, now=None): 0769 if now is None: now = time.time() 0770 if (self.expires is not None) and (self.expires <= now): 0771 return True 0772 return False 0773 0774 def __str__(self): 0775 if self.port is None: p = "" 0776 else: p = ":"+self.port 0777 limit = self.domain + p + self.path 0778 if self.value is not None: 0779 namevalue = "%s=%s" % (self.name, self.value) 0780 else: 0781 namevalue = self.name 0782 return "<Cookie %s for %s>" % (namevalue, limit) 0783 0784 def __repr__(self): 0785 args = [] 0786 for name in ["version", "name", "value", 0787 "port", "port_specified", 0788 "domain", "domain_specified", "domain_initial_dot", 0789 "path", "path_specified", 0790 "secure", "expires", "discard", "comment", "comment_url", 0791 ]: 0792 attr = getattr(self, name) 0793 args.append("%s=%s" % (name, repr(attr))) 0794 args.append("rest=%s" % repr(self._rest)) 0795 return "Cookie(%s)" % ", ".join(args) 0796 0797 0798 class CookiePolicy: 0799 """Defines which cookies get accepted from and returned to server. 0800 0801 May also modify cookies, though this is probably a bad idea. 0802 0803 The subclass DefaultCookiePolicy defines the standard rules for Netscape 0804 and RFC 2965 cookies -- override that if you want a customised policy. 0805 0806 """ 0807 def set_ok(self, cookie, request): 0808 """Return true if (and only if) cookie should be accepted from server. 0809 0810 Currently, pre-expired cookies never get this far -- the CookieJar 0811 class deletes such cookies itself. 0812 0813 """ 0814 raise NotImplementedError() 0815 0816 def return_ok(self, cookie, request): 0817 """Return true if (and only if) cookie should be returned to server.""" 0818 raise NotImplementedError() 0819 0820 def domain_return_ok(self, domain, request): 0821 """Return false if cookies should not be returned, given cookie domain. 0822 """ 0823 return True 0824 0825 def path_return_ok(self, path, request): 0826 """Return false if cookies should not be returned, given cookie path. 0827 """ 0828 return True 0829 0830 0831 class DefaultCookiePolicy(CookiePolicy): 0832 """Implements the standard rules for accepting and returning cookies.""" 0833 0834 DomainStrictNoDots = 1 0835 DomainStrictNonDomain = 2 0836 DomainRFC2965Match = 4 0837 0838 DomainLiberal = 0 0839 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 0840 0841 def __init__(self, 0842 blocked_domains=None, allowed_domains=None, 0843 netscape=True, rfc2965=False, 0844 hide_cookie2=False, 0845 strict_domain=False, 0846 strict_rfc2965_unverifiable=True, 0847 strict_ns_unverifiable=False, 0848 strict_ns_domain=DomainLiberal, 0849 strict_ns_set_initial_dollar=False, 0850 strict_ns_set_path=False, 0851 ): 0852 """Constructor arguments should be passed as keyword arguments only.""" 0853 self.netscape = netscape 0854 self.rfc2965 = rfc2965 0855 self.hide_cookie2 = hide_cookie2 0856 self.strict_domain = strict_domain 0857 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 0858 self.strict_ns_unverifiable = strict_ns_unverifiable 0859 self.strict_ns_domain = strict_ns_domain 0860 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 0861 self.strict_ns_set_path = strict_ns_set_path 0862 0863 if blocked_domains is not None: 0864 self._blocked_domains = tuple(blocked_domains) 0865 else: 0866 self._blocked_domains = () 0867 0868 if allowed_domains is not None: 0869 allowed_domains = tuple(allowed_domains) 0870 self._allowed_domains = allowed_domains 0871 0872 def blocked_domains(self): 0873 """Return the sequence of blocked domains (as a tuple).""" 0874 return self._blocked_domains 0875 def set_blocked_domains(self, blocked_domains): 0876 """Set the sequence of blocked domains.""" 0877 self._blocked_domains = tuple(blocked_domains) 0878 0879 def is_blocked(self, domain): 0880 for blocked_domain in self._blocked_domains: 0881 if user_domain_match(domain, blocked_domain): 0882 return True 0883 return False 0884 0885 def allowed_domains(self): 0886 """Return None, or the sequence of allowed domains (as a tuple).""" 0887 return self._allowed_domains 0888 def set_allowed_domains(self, allowed_domains): 0889 """Set the sequence of allowed domains, or None.""" 0890 if allowed_domains is not None: 0891 allowed_domains = tuple(allowed_domains) 0892 self._allowed_domains = allowed_domains 0893 0894 def is_not_allowed(self, domain): 0895 if self._allowed_domains is None: 0896 return False 0897 for allowed_domain in self._allowed_domains: 0898 if user_domain_match(domain, allowed_domain): 0899 return False 0900 return True 0901 0902 def set_ok(self, cookie, request): 0903 """ 0904 If you override .set_ok(), be sure to call this method. If it returns 0905 false, so should your subclass (assuming your subclass wants to be more 0906 strict about which cookies to accept). 0907 0908 """ 0909 debug(" - checking cookie %s=%s", cookie.name, cookie.value) 0910 0911 assert cookie.name is not None 0912 0913 for n in "version", "verifiability", "name", "path", "domain", "port": 0914 fn_name = "set_ok_"+n 0915 fn = getattr(self, fn_name) 0916 if not fn(cookie, request): 0917 return False 0918 0919 return True 0920 0921 def set_ok_version(self, cookie, request): 0922 if cookie.version is None: 0923 # Version is always set to 0 by parse_ns_headers if it's a Netscape 0924 # cookie, so this must be an invalid RFC 2965 cookie. 0925 debug(" Set-Cookie2 without version attribute (%s=%s)", 0926 cookie.name, cookie.value) 0927 return False 0928 if cookie.version > 0 and not self.rfc2965: 0929 debug(" RFC 2965 cookies are switched off") 0930 return False 0931 elif cookie.version == 0 and not self.netscape: 0932 debug(" Netscape cookies are switched off") 0933 return False 0934 return True 0935 0936 def set_ok_verifiability(self, cookie, request): 0937 if request.is_unverifiable() and is_third_party(request): 0938 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 0939 debug(" third-party RFC 2965 cookie during " 0940 "unverifiable transaction") 0941 return False 0942 elif cookie.version == 0 and self.strict_ns_unverifiable: 0943 debug(" third-party Netscape cookie during " 0944 "unverifiable transaction") 0945 return False 0946 return True 0947 0948 def set_ok_name(self, cookie, request): 0949 # Try and stop servers setting V0 cookies designed to hack other 0950 # servers that know both V0 and V1 protocols. 0951 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 0952 cookie.name.startswith("$")): 0953 debug(" illegal name (starts with '$'): '%s'", cookie.name) 0954 return False 0955 return True 0956 0957 def set_ok_path(self, cookie, request): 0958 if cookie.path_specified: 0959 req_path = request_path(request) 0960 if ((cookie.version > 0 or 0961 (cookie.version == 0 and self.strict_ns_set_path)) and 0962 not req_path.startswith(cookie.path)): 0963 debug(" path attribute %s is not a prefix of request " 0964 "path %s", cookie.path, req_path) 0965 return False 0966 return True 0967 0968 def set_ok_domain(self, cookie, request): 0969 if self.is_blocked(cookie.domain): 0970 debug(" domain %s is in user block-list", cookie.domain) 0971 return False 0972 if self.is_not_allowed(cookie.domain): 0973 debug(" domain %s is not in user allow-list", cookie.domain) 0974 return False 0975 if cookie.domain_specified: 0976 req_host, erhn = eff_request_host(request) 0977 domain = cookie.domain 0978 if self.strict_domain and (domain.count(".") >= 2): 0979 i = domain.rfind(".") 0980 j = domain.rfind(".", 0, i) 0981 if j == 0: # domain like .foo.bar 0982 tld = domain[i+1:] 0983 sld = domain[j+1:i] 0984 if (sld.lower() in [ 0985 "co", "ac", 0986 "com", "edu", "org", "net", "gov", "mil", "int"] and 0987 len(tld) == 2): 0988 # domain like .co.uk 0989 debug(" country-code second level domain %s", domain) 0990 return False 0991 if domain.startswith("."): 0992 undotted_domain = domain[1:] 0993 else: 0994 undotted_domain = domain 0995 embedded_dots = (undotted_domain.find(".") >= 0) 0996 if not embedded_dots and domain != ".local": 0997 debug(" non-local domain %s contains no embedded dot", 0998 domain) 0999 return False 1000 if cookie.version == 0: 1001 if (not erhn.endswith(domain) and 1002 (not erhn.startswith(".") and 1003 not ("."+erhn).endswith(domain))): 1004 debug(" effective request-host %s (even with added " 1005 "initial dot) does not end end with %s", 1006 erhn, domain) 1007 return False 1008 if (cookie.version > 0 or 1009 (self.strict_ns_domain & self.DomainRFC2965Match)): 1010 if not domain_match(erhn, domain): 1011 debug(" effective request-host %s does not domain-match " 1012 "%s", erhn, domain) 1013 return False 1014 if (cookie.version > 0 or 1015 (self.strict_ns_domain & self.DomainStrictNoDots)): 1016 host_prefix = req_host[:-len(domain)] 1017 if (host_prefix.find(".") >= 0 and 1018 not IPV4_RE.search(req_host)): 1019 debug(" host prefix %s for domain %s contains a dot", 1020 host_prefix, domain) 1021 return False 1022 return True 1023 1024 def set_ok_port(self, cookie, request): 1025 if cookie.port_specified: 1026 req_port = request_port(request) 1027 if req_port is None: 1028 req_port = "80" 1029 else: 1030 req_port = str(req_port) 1031 for p in cookie.port.split(","): 1032 try: 1033 int(p) 1034 except ValueError: 1035 debug(" bad port %s (not numeric)", p) 1036 return False 1037 if p == req_port: 1038 break 1039 else: 1040 debug(" request port (%s) not found in %s", 1041 req_port, cookie.port) 1042 return False 1043 return True 1044 1045 def return_ok(self, cookie, request): 1046 """ 1047 If you override .return_ok(), be sure to call this method. If it 1048 returns false, so should your subclass (assuming your subclass wants to 1049 be more strict about which cookies to return). 1050 1051 """ 1052 # Path has already been checked by .path_return_ok(), and domain 1053 # blocking done by .domain_return_ok(). 1054 debug(" - checking cookie %s=%s", cookie.name, cookie.value) 1055 1056 for n in "version", "verifiability", "secure", "expires", "port", "domain": 1057 fn_name = "return_ok_"+n 1058 fn = getattr(self, fn_name) 1059 if not fn(cookie, request): 1060 return False 1061 return True 1062 1063 def return_ok_version(self, cookie, request): 1064 if cookie.version > 0 and not self.rfc2965: 1065 debug(" RFC 2965 cookies are switched off") 1066 return False 1067 elif cookie.version == 0 and not self.netscape: 1068 debug(" Netscape cookies are switched off") 1069 return False 1070 return True 1071 1072 def return_ok_verifiability(self, cookie, request): 1073 if request.is_unverifiable() and is_third_party(request): 1074 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 1075 debug(" third-party RFC 2965 cookie during unverifiable " 1076 "transaction") 1077 return False 1078 elif cookie.version == 0 and self.strict_ns_unverifiable: 1079 debug(" third-party Netscape cookie during unverifiable " 1080 "transaction") 1081 return False 1082 return True 1083 1084 def return_ok_secure(self, cookie, request): 1085 if cookie.secure and request.get_type() != "https": 1086 debug(" secure cookie with non-secure request") 1087 return False 1088 return True 1089 1090 def return_ok_expires(self, cookie, request): 1091 if cookie.is_expired(self._now): 1092 debug(" cookie expired") 1093 return False 1094 return True 1095 1096 def return_ok_port(self, cookie, request): 1097 if cookie.port: 1098 req_port = request_port(request) 1099 if req_port is None: 1100 req_port = "80" 1101 for p in cookie.port.split(","): 1102 if p == req_port: 1103 break 1104 else: 1105 debug(" request port %s does not match cookie port %s", 1106 req_port, cookie.port) 1107 return False 1108 return True 1109 1110 def return_ok_domain(self, cookie, request): 1111 req_host, erhn = eff_request_host(request) 1112 domain = cookie.domain 1113 1114 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't 1115 if (cookie.version == 0 and 1116 (self.strict_ns_domain & self.DomainStrictNonDomain) and 1117 not cookie.domain_specified and domain != erhn): 1118 debug(" cookie with unspecified domain does not string-compare " 1119 "equal to request domain") 1120 return False 1121 1122 if cookie.version > 0 and not domain_match(erhn, domain): 1123 debug(" effective request-host name %s does not domain-match " 1124 "RFC 2965 cookie domain %s", erhn, domain) 1125 return False 1126 if cookie.version == 0 and not ("."+erhn).endswith(domain): 1127 debug(" request-host %s does not match Netscape cookie domain " 1128 "%s", req_host, domain) 1129 return False 1130 return True 1131 1132 def domain_return_ok(self, domain, request): 1133 # Liberal check of. This is here as an optimization to avoid 1134 # having to load lots of MSIE cookie files unless necessary. 1135 req_host, erhn = eff_request_host(request) 1136 if not req_host.startswith("."): 1137 dotted_req_host = "."+req_host 1138 if not erhn.startswith("."): 1139 dotted_erhn = "."+erhn 1140 if not (dotted_req_host.endswith(domain) or 1141 dotted_erhn.endswith(domain)): 1142 #debug(" request domain %s does not match cookie domain %s", 1143 # req_host, domain) 1144 return False 1145 1146 if self.is_blocked(domain): 1147 debug(" domain %s is in user block-list", domain) 1148 return False 1149 if self.is_not_allowed(domain): 1150 debug(" domain %s is not in user allow-list", domain) 1151 return False 1152 1153 return True 1154 1155 def path_return_ok(self, path, request): 1156 debug("- checking cookie path=%s", path) 1157 req_path = request_path(request) 1158 if not req_path.startswith(path): 1159 debug(" %s does not path-match %s", req_path, path) 1160 return False 1161 return True 1162 1163 1164 def vals_sorted_by_key(adict): 1165 keys = adict.keys() 1166 keys.sort() 1167 return map(adict.get, keys) 1168 1169 def deepvalues(mapping): 1170 """Iterates over nested mapping, depth-first, in sorted order by key.""" 1171 values = vals_sorted_by_key(mapping) 1172 for obj in values: 1173 mapping = False 1174 try: 1175 obj.items 1176 except AttributeError: 1177 pass 1178 else: 1179 mapping = True 1180 for subobj in deepvalues(obj): 1181 yield subobj 1182 if not mapping: 1183 yield obj 1184 1185 1186 # Used as second parameter to dict.get() method, to distinguish absent 1187 # dict key from one with a None value. 1188 class Absent: pass 1189 1190 class CookieJar: 1191 """Collection of HTTP cookies. 1192 1193 You may not need to know about this class: try 1194 urllib2.build_opener(HTTPCookieProcessor).open(url). 1195 1196 """ 1197 1198 non_word_re = re.compile(r"\W") 1199 quote_re = re.compile(r"([\"\\])") 1200 strict_domain_re = re.compile(r"\.?[^.]*") 1201 domain_re = re.compile(r"[^.]*") 1202 dots_re = re.compile(r"^\.+") 1203 1204 magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" 1205 1206 def __init__(self, policy=None): 1207 if policy is None: 1208 policy = DefaultCookiePolicy() 1209 self._policy = policy 1210 1211 self._cookies_lock = _threading.RLock() 1212 self._cookies = {} 1213 1214 def set_policy(self, policy): 1215 self._policy = policy 1216 1217 def _cookies_for_domain(self, domain, request): 1218 cookies = [] 1219 if not self._policy.domain_return_ok(domain, request): 1220 return [] 1221 debug("Checking %s for cookies to return", domain) 1222 cookies_by_path = self._cookies[domain] 1223 for path in cookies_by_path.keys(): 1224 if not self._policy.path_return_ok(path, request): 1225 continue 1226 cookies_by_name = cookies_by_path[path] 1227 for cookie in cookies_by_name.values(): 1228 if not self._policy.return_ok(cookie, request): 1229 debug(" not returning cookie") 1230 continue 1231 debug(" it's a match") 1232 cookies.append(cookie) 1233 return cookies 1234 1235 def _cookies_for_request(self, request): 1236 """Return a list of cookies to be returned to server.""" 1237 cookies = [] 1238 for domain in self._cookies.keys(): 1239 cookies.extend(self._cookies_for_domain(domain, request)) 1240 return cookies 1241 1242 def _cookie_attrs(self, cookies): 1243 """Return a list of cookie-attributes to be returned to server. 1244 1245 like ['foo="bar"; $Path="/"', ...] 1246 1247 The $Version attribute is also added when appropriate (currently only 1248 once per request). 1249 1250 """ 1251 # add cookies in order of most specific (ie. longest) path first 1252 def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) 1253 cookies.sort(decreasing_size) 1254 1255 version_set = False 1256 1257 attrs = [] 1258 for cookie in cookies: 1259 # set version of Cookie header 1260 # XXX 1261 # What should it be if multiple matching Set-Cookie headers have 1262 # different versions themselves? 1263 # Answer: there is no answer; was supposed to be settled by 1264 # RFC 2965 errata, but that may never appear... 1265 version = cookie.version 1266 if not version_set: 1267 version_set = True 1268 if version > 0: 1269 attrs.append("$Version=%s" % version) 1270 1271 # quote cookie value if necessary 1272 # (not for Netscape protocol, which already has any quotes 1273 # intact, due to the poorly-specified Netscape Cookie: syntax) 1274 if ((cookie.value is not None) and 1275 self.non_word_re.search(cookie.value) and version > 0): 1276 value = self.quote_re.sub(r"\\\1", cookie.value) 1277 else: 1278 value = cookie.value 1279 1280 # add cookie-attributes to be returned in Cookie header 1281 if cookie.value is None: 1282 attrs.append(cookie.name) 1283 else: 1284 attrs.append("%s=%s" % (cookie.name, value)) 1285 if version > 0: 1286 if cookie.path_specified: 1287 attrs.append('$Path="%s"' % cookie.path) 1288 if cookie.domain.startswith("."): 1289 domain = cookie.domain 1290 if (not cookie.domain_initial_dot and 1291 domain.startswith(".")): 1292 domain = domain[1:] 1293 attrs.append('$Domain="%s"' % domain) 1294 if cookie.port is not None: 1295 p = "$Port" 1296 if cookie.port_specified: 1297 p = p + ('="%s"' % cookie.port) 1298 attrs.append(p) 1299 1300 return attrs 1301 1302 def add_cookie_header(self, request): 1303 """Add correct Cookie: header to request (urllib2.Request object). 1304 1305 The Cookie2 header is also added unless policy.hide_cookie2 is true. 1306 1307 """ 1308 debug("add_cookie_header") 1309 self._cookies_lock.acquire() 1310 1311 self._policy._now = self._now = int(time.time()) 1312 1313 req_host, erhn = eff_request_host(request) 1314 strict_non_domain = ( 1315 self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain) 1316 1317 cookies = self._cookies_for_request(request) 1318 1319 attrs = self._cookie_attrs(cookies) 1320 if attrs: 1321 if not request.has_header("Cookie"): 1322 request.add_unredirected_header( 1323 "Cookie", "; ".join(attrs)) 1324 1325 # if necessary, advertise that we know RFC 2965 1326 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1327 not request.has_header("Cookie2")): 1328 for cookie in cookies: 1329 if cookie.version != 1: 1330 request.add_unredirected_header("Cookie2", '$Version="1"') 1331 break 1332 1333 self._cookies_lock.release() 1334 1335 self.clear_expired_cookies() 1336 1337 def _normalized_cookie_tuples(self, attrs_set): 1338 """Return list of tuples containing normalised cookie information. 1339 1340 attrs_set is the list of lists of key,value pairs extracted from 1341 the Set-Cookie or Set-Cookie2 headers. 1342 1343 Tuples are name, value, standard, rest, where name and value are the 1344 cookie name and value, standard is a dictionary containing the standard 1345 cookie-attributes (discard, secure, version, expires or max-age, 1346 domain, path and port) and rest is a dictionary containing the rest of 1347 the cookie-attributes. 1348 1349 """ 1350 cookie_tuples = [] 1351 1352 boolean_attrs = "discard", "secure" 1353 value_attrs = ("version", 1354 "expires", "max-age", 1355 "domain", "path", "port", 1356 "comment", "commenturl") 1357 1358 for cookie_attrs in attrs_set: 1359 name, value = cookie_attrs[0] 1360 1361 # Build dictionary of standard cookie-attributes (standard) and 1362 # dictionary of other cookie-attributes (rest). 1363 1364 # Note: expiry time is normalised to seconds since epoch. V0 1365 # cookies should have the Expires cookie-attribute, and V1 cookies 1366 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1367 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1368 # accept either (but prefer Max-Age). 1369 max_age_set = False 1370 1371 bad_cookie = False 1372 1373 standard = {} 1374 rest = {} 1375 for k, v in cookie_attrs[1:]: 1376 lc = k.lower() 1377 # don't lose case distinction for unknown fields 1378 if lc in value_attrs or lc in boolean_attrs: 1379 k = lc 1380 if k in boolean_attrs and v is None: 1381 # boolean cookie-attribute is present, but has no value 1382 # (like "discard", rather than "port=80") 1383 v = True 1384 if k in standard: 1385 # only first value is significant 1386 continue 1387 if k == "domain": 1388 if v is None: 1389 debug(" missing value for domain attribute") 1390 bad_cookie = True 1391 break 1392 # RFC 2965 section 3.3.3 1393 v = v.lower() 1394 if k == "expires": 1395 if max_age_set: 1396 # Prefer max-age to expires (like Mozilla) 1397 continue 1398 if v is None: 1399 debug(" missing or invalid value for expires " 1400 "attribute: treating as session cookie") 1401 continue 1402 if k == "max-age": 1403 max_age_set = True 1404 try: 1405 v = int(v) 1406 except ValueError: 1407 debug(" missing or invalid (non-numeric) value for " 1408 "max-age attribute") 1409 bad_cookie = True 1410 break 1411 # convert RFC 2965 Max-Age to seconds since epoch 1412 # XXX Strictly you're supposed to follow RFC 2616 1413 # age-calculation rules. Remember that zero Max-Age is a 1414 # is a request to discard (old and new) cookie, though. 1415 k = "expires" 1416 v = self._now + v 1417 if (k in value_attrs) or (k in boolean_attrs): 1418 if (v is None and 1419 k not in ["port", "comment", "commenturl"]): 1420 debug(" missing value for %s attribute" % k) 1421 bad_cookie = True 1422 break 1423 standard[k] = v 1424 else: 1425 rest[k] = v 1426 1427 if bad_cookie: 1428 continue 1429 1430 cookie_tuples.append((name, value, standard, rest)) 1431 1432 return cookie_tuples 1433 1434 def _cookie_from_cookie_tuple(self, tup, request): 1435 # standard is dict of standard cookie-attributes, rest is dict of the 1436 # rest of them 1437 name, value, standard, rest = tup 1438 1439 domain = standard.get("domain", Absent) 1440 path = standard.get("path", Absent) 1441 port = standard.get("port", Absent) 1442 expires = standard.get("expires", Absent) 1443 1444 # set the easy defaults 1445 version = standard.get("version", None) 1446 if version is not None: version = int(version) 1447 secure = standard.get("secure", False) 1448 # (discard is also set if expires is Absent) 1449 discard = standard.get("discard", False) 1450 comment = standard.get("comment", None) 1451 comment_url = standard.get("commenturl", None) 1452 1453 # set default path 1454 if path is not Absent and path != "": 1455 path_specified = True 1456 path = escape_path(path) 1457 else: 1458 path_specified = False 1459 path = request_path(request) 1460 i = path.rfind("/") 1461 if i != -1: 1462 if version == 0: 1463 # Netscape spec parts company from reality here 1464 path = path[:i] 1465 else: 1466 path = path[:i+1] 1467 if len(path) == 0: path = "/" 1468 1469 # set default domain 1470 domain_specified = domain is not Absent 1471 # but first we have to remember whether it starts with a dot 1472 domain_initial_dot = False 1473 if domain_specified: 1474 domain_initial_dot = bool(domain.startswith(".")) 1475 if domain is Absent: 1476 req_host, erhn = eff_request_host(request) 1477 domain = erhn 1478 elif not domain.startswith("."): 1479 domain = "."+domain 1480 1481 # set default port 1482 port_specified = False 1483 if port is not Absent: 1484 if port is None: 1485 # Port attr present, but has no value: default to request port. 1486 # Cookie should then only be sent back on that port. 1487 port = request_port(request) 1488 else: 1489 port_specified = True 1490 port = re.sub(r"\s+", "", port) 1491 else: 1492 # No port attr present. Cookie can be sent back on any port. 1493 port = None 1494 1495 # set default expires and discard 1496 if expires is Absent: 1497 expires = None 1498 discard = True 1499 elif expires <= self._now: 1500 # Expiry date in past is request to delete cookie. This can't be 1501 # in DefaultCookiePolicy, because can't delete cookies there. 1502 try: 1503 self.clear(domain, path, name) 1504 except KeyError: 1505 pass 1506 debug("Expiring cookie, domain='%s', path='%s', name='%s'", 1507 domain, path, name) 1508 return None 1509 1510 return Cookie(version, 1511 name, value, 1512 port, port_specified, 1513 domain, domain_specified, domain_initial_dot, 1514 path, path_specified, 1515 secure, 1516 expires, 1517 discard, 1518 comment, 1519 comment_url, 1520 rest) 1521 1522 def _cookies_from_attrs_set(self, attrs_set, request): 1523 cookie_tuples = self._normalized_cookie_tuples(attrs_set) 1524 1525 cookies = [] 1526 for tup in cookie_tuples: 1527 cookie = self._cookie_from_cookie_tuple(tup, request) 1528 if cookie: cookies.append(cookie) 1529 return cookies 1530 1531 def make_cookies(self, response, request): 1532 """Return sequence of Cookie objects extracted from response object.""" 1533 # get cookie-attributes for RFC 2965 and Netscape protocols 1534 headers = response.info() 1535 rfc2965_hdrs = headers.getheaders("Set-Cookie2") 1536 ns_hdrs = headers.getheaders("Set-Cookie") 1537 1538 rfc2965 = self._policy.rfc2965 1539 netscape = self._policy.netscape 1540 1541 if ((not rfc2965_hdrs and not ns_hdrs) or 1542 (not ns_hdrs and not rfc2965) or 1543 (not rfc2965_hdrs and not netscape) or 1544 (not netscape and not rfc2965)): 1545 return [] # no relevant cookie headers: quick exit 1546 1547 try: 1548 cookies = self._cookies_from_attrs_set( 1549 split_header_words(rfc2965_hdrs), request) 1550 except: 1551 reraise_unmasked_exceptions() 1552 cookies = [] 1553 1554 if ns_hdrs and netscape: 1555 try: 1556 ns_cookies = self._cookies_from_attrs_set( 1557 parse_ns_headers(ns_hdrs), request) 1558 except: 1559 reraise_unmasked_exceptions() 1560 ns_cookies = [] 1561 1562 # Look for Netscape cookies (from Set-Cookie headers) that match 1563 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). 1564 # For each match, keep the RFC 2965 cookie and ignore the Netscape 1565 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are 1566 # bundled in with the Netscape cookies for this purpose, which is 1567 # reasonable behaviour. 1568 if rfc2965: 1569 lookup = {} 1570 for cookie in cookies: 1571 lookup[(cookie.domain, cookie.path, cookie.name)] = None 1572 1573 def no_matching_rfc2965(ns_cookie, lookup=lookup): 1574 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name 1575 return key not in lookup 1576 ns_cookies = filter(no_matching_rfc2965, ns_cookies) 1577 1578 if ns_cookies: 1579 cookies.extend(ns_cookies) 1580 1581 return cookies 1582 1583 def set_cookie_if_ok(self, cookie, request): 1584 """Set a cookie if policy says it's OK to do so.""" 1585 self._cookies_lock.acquire() 1586 self._policy._now = self._now = int(time.time()) 1587 1588 if self._policy.set_ok(cookie, request): 1589 self.set_cookie(cookie) 1590 1591 self._cookies_lock.release() 1592 1593 def set_cookie(self, cookie): 1594 """Set a cookie, without checking whether or not it should be set.""" 1595 c = self._cookies 1596 self._cookies_lock.acquire() 1597 try: 1598 if cookie.domain not in c: c[cookie.domain] = {} 1599 c2 = c[cookie.domain] 1600 if cookie.path not in c2: c2[cookie.path] = {} 1601 c3 = c2[cookie.path] 1602 c3[cookie.name] = cookie 1603 finally: 1604 self._cookies_lock.release() 1605 1606 def extract_cookies(self, response, request): 1607 """Extract cookies from response, where allowable given the request.""" 1608 debug("extract_cookies: %s", response.info()) 1609 self._cookies_lock.acquire() 1610 self._policy._now = self._now = int(time.time()) 1611 1612 for cookie in self.make_cookies(response, request): 1613 if self._policy.set_ok(cookie, request): 1614 debug(" setting cookie: %s", cookie) 1615 self.set_cookie(cookie) 1616 self._cookies_lock.release() 1617 1618 def clear(self, domain=None, path=None, name=None): 1619 """Clear some cookies. 1620 1621 Invoking this method without arguments will clear all cookies. If 1622 given a single argument, only cookies belonging to that domain will be 1623 removed. If given two arguments, cookies belonging to the specified 1624 path within that domain are removed. If given three arguments, then 1625 the cookie with the specified name, path and domain is removed. 1626 1627 Raises KeyError if no matching cookie exists. 1628 1629 """ 1630 if name is not None: 1631 if (domain is None) or (path is None): 1632 raise ValueError( 1633 "domain and path must be given to remove a cookie by name") 1634 del self._cookies[domain][path][name] 1635 elif path is not None: 1636 if domain is None: 1637 raise ValueError( 1638 "domain must be given to remove cookies by path") 1639 del self._cookies[domain][path] 1640 elif domain is not None: 1641 del self._cookies[domain] 1642 else: 1643 self._cookies = {} 1644 1645 def clear_session_cookies(self): 1646 """Discard all session cookies. 1647 1648 Note that the .save() method won't save session cookies anyway, unless 1649 you ask otherwise by passing a true ignore_discard argument. 1650 1651 """ 1652 self._cookies_lock.acquire() 1653 for cookie in self: 1654 if cookie.discard: 1655 self.clear(cookie.domain, cookie.path, cookie.name) 1656 self._cookies_lock.release() 1657 1658 def clear_expired_cookies(self): 1659 """Discard all expired cookies. 1660 1661 You probably don't need to call this method: expired cookies are never 1662 sent back to the server (provided you're using DefaultCookiePolicy), 1663 this method is called by CookieJar itself every so often, and the 1664 .save() method won't save expired cookies anyway (unless you ask 1665 otherwise by passing a true ignore_expires argument). 1666 1667 """ 1668 self._cookies_lock.acquire() 1669 now = time.time() 1670 for cookie in self: 1671 if cookie.is_expired(now): 1672 self.clear(cookie.domain, cookie.path, cookie.name) 1673 self._cookies_lock.release() 1674 1675 def __iter__(self): 1676 return deepvalues(self._cookies) 1677 1678 def __len__(self): 1679 """Return number of contained cookies.""" 1680 i = 0 1681 for cookie in self: i = i + 1 1682 return i 1683 1684 def __repr__(self): 1685 r = [] 1686 for cookie in self: r.append(repr(cookie)) 1687 return "<%s[%s]>" % (self.__class__, ", ".join(r)) 1688 1689 def __str__(self): 1690 r = [] 1691 for cookie in self: r.append(str(cookie)) 1692 return "<%s[%s]>" % (self.__class__, ", ".join(r)) 1693 1694 1695 class LoadError(Exception): pass 1696 1697 class FileCookieJar(CookieJar): 1698 """CookieJar that can be loaded from and saved to a file.""" 1699 1700 def __init__(self, filename=None, delayload=False, policy=None): 1701 """ 1702 Cookies are NOT loaded from the named file until either the .load() or 1703 .revert() method is called. 1704 1705 """ 1706 CookieJar.__init__(self, policy) 1707 if filename is not None: 1708 try: 1709 filename+"" 1710 except: 1711 raise ValueError("filename must be string-like") 1712 self.filename = filename 1713 self.delayload = bool(delayload) 1714 1715 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1716 """Save cookies to a file.""" 1717 raise NotImplementedError() 1718 1719 def load(self, filename=None, ignore_discard=False, ignore_expires=False): 1720 """Load cookies from a file.""" 1721 if filename is None: 1722 if self.filename is not None: filename = self.filename 1723 else: raise ValueError(MISSING_FILENAME_TEXT) 1724 1725 f = open(filename) 1726 try: 1727 self._really_load(f, filename, ignore_discard, ignore_expires) 1728 finally: 1729 f.close() 1730 1731 def revert(self, filename=None, 1732 ignore_discard=False, ignore_expires=False): 1733 """Clear all cookies and reload cookies from a saved file. 1734 1735 Raises LoadError (or IOError) if reversion is not successful; the 1736 object's state will not be altered if this happens. 1737 1738 """ 1739 if filename is None: 1740 if self.filename is not None: filename = self.filename 1741 else: raise ValueError(MISSING_FILENAME_TEXT) 1742 1743 self._cookies_lock.acquire() 1744 1745 old_state = copy.deepcopy(self._cookies) 1746 self._cookies = {} 1747 try: 1748 self.load(filename, ignore_discard, ignore_expires) 1749 except (LoadError, IOError): 1750 self._cookies = old_state 1751 raise 1752 1753 self._cookies_lock.release() 1754 1755 from _LWPCookieJar import LWPCookieJar, lwp_cookie_str 1756 from _MozillaCookieJar import MozillaCookieJar 1757
Generated by PyXR 0.9.4