# Copyright (C) 2002-2004 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

"""Header encoding and decoding functionality."""

import re
import binascii

import email.quopriMIME
import email.base64MIME
from email.Errors import HeaderParseError
from email.Charset import Charset

NL = '\n'
SPACE = ' '
USPACE = u' '
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

MAXLINELEN = 76

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal 041)
# to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')



# Helpers
_max_append = email.quopriMIME._max_append



def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    An email.Errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields a flat list of the form
        # [unencoded, charset, encoding, encoded, unencoded, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                dec = None
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    try:
                        dec = email.base64MIME.decode(encoded)
                    except binascii.Error:
                        # Turn this into a higher level exception.  BAW: Right
                        # now we throw the lower level exception away but
                        # when/if we get exception chaining, we'll preserve it.
                        raise HeaderParseError
                if dec is None:
                    dec = encoded

                # Adjacent encoded words in the same charset are merged into
                # a single pair.
                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the (charset, encoding, encoded) triple just consumed.
            del parts[0:3]
    return decoded



def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Create a Header from a sequence of pairs as returned by decode_header()

    decode_header() takes a header value string and returns a sequence of
    pairs of the format (decoded_string, charset) where charset is the string
    name of the character set.

    This function takes one of those sequence of pairs and returns a Header
    instance.  Optional maxlinelen, header_name, and continuation_ws are as in
    the Header constructor.
    """
    h = Header(maxlinelen=maxlinelen, header_name=header_name,
               continuation_ws=continuation_ws)
    for s, charset in decoded_seq:
        # None means us-ascii but we can simply pass it on to h.append()
        if charset is not None and not isinstance(charset, Charset):
            charset = Charset(charset)
        h.append(s, charset)
    return h



class Header:
    """A header value stored as a list of (byte string, Charset) chunks.

    Chunks are added with .append() and turned into a folded header string
    by .encode() (which str() calls); unicode() decodes the chunks instead.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab counts as eight columns when measuring line length.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.  Note that the loop
                # rebinds `charset' to whichever candidate succeeded, and that
                # is the charset stored with the chunk below.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        """Split up a header safely for use with _encode_chunks.

        Returns a list of (string, charset) pairs.  maxlinelen applies to the
        first pair only; the remainder is split recursively against
        self._maxlinelen.
        """
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        """Split an ASCII string and pair every resulting line with charset.

        Delegates to the module-level _split_ascii() using this header's
        continuation whitespace and subsequent-line length.
        """
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return [(chunk, charset) for chunk in chunks]

    def _encode_chunks(self, newchunks, maxlinelen):
        """Join (string, charset) pairs into one folded header string."""
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        chunks = []
        for header, charset in newchunks:
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            # NOTE(review): _max_append comes from email.quopriMIME and is not
            # visible here; presumably it extends the last entry of chunks
            # when s still fits within maxlinelen and starts a new entry
            # otherwise -- confirm against that module.
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        # NOTE(review): no fallback that "returns the header untouched" is
        # visible in this method itself; if it exists it lives in Charset --
        # confirm before relying on that paragraph of the docstring.
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)



def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Split an ASCII header value into a list of lines.

    firstlen limits the first line and restlen the following ones.  Each
    over-long line is split on the first character of splitchars that it
    contains; a line containing none of them is kept unsplit.  The returned
    lines do not carry continuation_ws; it is only used to account for the
    width the caller will prepend.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is caller-supplied, so escape it: an unescaped regex
        # metacharacter (e.g. '.' or '(') would split in the wrong places or
        # fail to compile.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines



def _binsplit(splittable, charset, maxlinelen):
    """Binary-search the longest prefix whose encoded form fits maxlinelen.

    Returns a (first, last) pair obtained by converting splittable[:i] and
    splittable[i:] back with charset.from_splittable(..., False), where i is
    the largest prefix length whose encoded header length is <= maxlinelen.
    """
    i = 0
    j = len(splittable)
    while i < j:
        # Invariants:
        # 1. splittable[:k] fits for all k <= i (note that we *assume*,
        #    at the start, that splittable[:0] fits).
        # 2. splittable[:k] does not fit for any k > j (at the start,
        #    this means we shouldn't look at any k > len(splittable)).
        # 3. We don't know about splittable[:k] for k in i+1..j.
        # 4. We want to set i to the largest k that fits, with i <= k <= j.
        #
        m = (i+j+1) >> 1  # ceiling((i+j)/2); i < m <= j
        chunk = charset.from_splittable(splittable[:m], True)
        chunklen = charset.encoded_header_len(chunk)
        if chunklen <= maxlinelen:
            # m is acceptable, so is a new lower bound.
            i = m
        else:
            # m is not acceptable, so final i must be < m.
            j = m - 1
    # i == j.  Invariant #1 implies that splittable[:i] fits, and
    # invariant #2 implies that splittable[:i+1] does not fit, so i
    # is what we're looking for.
    first = charset.from_splittable(splittable[:i], False)
    last = charset.from_splittable(splittable[i:], False)
    return first, last

# Generated by PyXR 0.9.4