# Copyright (C) 2001-2004 Python Software Foundation
# Author: Ben Gertzfield
# Contact: email-sig@python.org

"""Quoted-printable content transfer encoding per RFCs 2045-2047.

This module handles the content transfer encoding method defined in RFC 2045
to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
safely encode text that is in a character set similar to the 7-bit US ASCII
character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.

Quoted-printable is very space-inefficient for encoding binary files; use the
email.base64MIME module for that instead.

This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.

RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.

This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding.  To deal with the various line
wrapping issues, use the email.Header module.
"""

import re
from string import hexdigits

try:
    from email.Utils import fix_eols
except ImportError:
    # email.Utils cannot be imported on Python 3; provide an equivalent
    # helper locally so this module stays importable there.
    def fix_eols(s):
        """Replace all line-ending characters in s with \\r\\n."""
        # Newlines with no preceding carriage return
        s = re.sub(r'(?<!\r)\n', '\r\n', s)
        # Carriage returns with no following newline
        return re.sub(r'\r(?!\n)', '\r\n', s)

CRLF = '\r\n'
NL = '\n'

# See also Charset.py
MISC_LEN = 7

# Characters that must be escaped in a header (everything outside the safe
# set) and in a body (everything outside printable ASCII minus "=", plus tab).
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
bqre = re.compile(r'[^ !-<>-~\t]')



# Helpers
def header_quopri_check(c):
    """Return True if the character should be escaped with header quopri."""
    return bool(hqre.match(c))


def body_quopri_check(c):
    """Return True if the character should be escaped with body quopri."""
    return bool(bqre.match(c))


def header_quopri_len(s):
    """Return the length of s when it is encoded with header quopri.

    Every character that needs escaping counts as three characters (=XX),
    every other character as one.
    """
    count = 0
    for c in s:
        if hqre.match(c):
            count += 3
        else:
            count += 1
    return count


def body_quopri_len(str):
    """Return the length of str when it is encoded with body quopri.

    Every character that needs escaping counts as three characters (=XX),
    every other character as one.
    """
    # The parameter name shadows the builtin, but it is part of the public
    # signature and is therefore left unchanged.
    count = 0
    for c in str:
        if bqre.match(c):
            count += 3
        else:
            count += 1
    return count


def _max_append(L, s, maxlen, extra=''):
    """Append s to the last chunk in L, or start a new chunk.

    s is glued (preceded by extra) onto the last element of L when the
    combined length of that element and s does not exceed maxlen; otherwise
    a new element holding s with leading whitespace stripped is appended.
    L is modified in place.
    """
    if not L:
        L.append(s.lstrip())
    elif len(L[-1]) + len(s) <= maxlen:
        L[-1] += extra + s
    else:
        L.append(s.lstrip())


def unquote(s):
    """Turn a string in the form =AB to the ASCII character with value 0xab"""
    return chr(int(s[1:3], 16))


def quote(c):
    """Return the =XX (upper-case hex) escape for the single character c."""
    return "=%02X" % ord(c)



def header_encode(header, charset="iso-8859-1", keep_eols=False,
                  maxlinelen=76, eol=NL):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in non-RFC
    2047 aware mail clients.

    charset names the character set to use to encode the header.  It defaults
    to iso-8859-1.

    The resulting string will be in the form:

    "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?=\\n
      =?charset?q?Silly_=C8nglish_Kn=EEghts?="

    with each line wrapped safely at, at most, maxlinelen characters (defaults
    to 76 characters).  If maxlinelen is None, the entire string is encoded in
    one chunk with no splitting.

    End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
    to the canonical email line separator \\r\\n unless the keep_eols
    parameter is True (the default is False).

    Each line of the header will be terminated in the value of eol, which
    defaults to "\\n".  Set this to "\\r\\n" if you are using the result of
    this function directly in email.
    """
    # Return empty headers unchanged
    if not header:
        return header

    if not keep_eols:
        header = fix_eols(header)

    # Quopri encode each line, in encoded chunks no greater than maxlinelen in
    # length, after the RFC chrome is added in.
    quoted = []
    if maxlinelen is None:
        # An obnoxiously large number that's good enough
        max_encoded = 100000
    else:
        max_encoded = maxlinelen - len(charset) - MISC_LEN - 1

    for c in header:
        # Space may be represented as _ instead of =20 for readability
        if c == ' ':
            _max_append(quoted, '_', max_encoded)
        # These characters can be included verbatim
        elif not hqre.match(c):
            _max_append(quoted, c, max_encoded)
        # Otherwise, replace with hex value like =E2
        else:
            _max_append(quoted, quote(c), max_encoded)

    # Now add the RFC chrome to each encoded chunk and glue the chunks
    # together.  BAW: should we be able to specify the leading whitespace in
    # the joiner?
    joiner = eol + ' '
    return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])



def encode(body, binary=False, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is False (the default), end-of-line characters will be converted
    to the canonical email end-of-line sequence \\r\\n.  Otherwise they will
    be left verbatim.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # Output is collected as a list of pieces and joined once at the end
    # rather than grown by repeated string concatenation.
    chunks = []
    # Preserve line endings here so we can check later to see whether an eol
    # needs to be added to the output.
    lines = body.splitlines(1)
    last_lineno = len(lines) - 1
    for lineno, raw_line in enumerate(lines):
        # Remember whether the line had a line ending, but strip it off for
        # processing this line.
        has_eol = raw_line[-1] in CRLF
        if raw_line.endswith(CRLF):
            line = raw_line[:-2]
        elif has_eol:
            line = raw_line[:-1]
        else:
            line = raw_line

        encoded_line = ''
        prev = None
        linelen = len(line)
        # Now we need to examine every character to see if it needs to be
        # quopri encoded.
        for j, c in enumerate(line):
            prev = c
            if bqre.match(c):
                c = quote(c)
            elif j + 1 == linelen:
                # Whitespace at end of line is a special case: it is held
                # back here and emitted by the end-of-line handling below.
                if c not in ' \t':
                    encoded_line += c
                continue
            # Check to see if the line has reached its maximum length
            if len(encoded_line) + len(c) >= maxlinelen:
                chunks.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c
        # Now at end of line..
        if prev and prev in ' \t':
            # Special case for whitespace at end of file: it must be escaped
            if lineno == last_lineno:
                prev = quote(prev)
                if len(encoded_line) + len(prev) > maxlinelen:
                    chunks.append(encoded_line + '=' + eol + prev)
                else:
                    chunks.append(encoded_line + prev)
            # Just normal whitespace at end of line: protect it with a soft
            # line break
            else:
                chunks.append(encoded_line + prev + '=' + eol)
            encoded_line = ''
        # If the line we just finished had a line ending, we need to add eol
        # to the end of the encoded line.
        if has_eol:
            chunks.append(encoded_line + eol)
        else:
            chunks.append(encoded_line)
    return ''.join(chunks)


# For convenience and backwards compatibility w/ standard base64 module
body_encode = encode
encodestring = encode



# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    Trailing whitespace on each encoded line is discarded, a lone "=" at the
    end of a line is treated as a soft line break, and an "=" that is not
    followed by two hex digits is passed through literally.  If the encoded
    text does not end with a line break, neither does the result.
    """
    if not encoded:
        return encoded
    # Collect decoded pieces in a list and join once at the end.
    pieces = []

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            pieces.append(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                pieces.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # this is a soft line break: skip it and emit no eol.
            elif i + 1 == n:
                i += 1
                continue
            # Decode if in form =AB
            elif (i + 2 < n and line[i + 1] in hexdigits
                  and line[i + 2] in hexdigits):
                pieces.append(unquote(line[i:i + 3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                pieces.append(c)
                i += 1

            if i == n:
                pieces.append(eol)

    decoded = ''.join(pieces)
    # Special case if the original string did not end with a line break:
    # drop the eol added after the final line.  The whole eol is removed,
    # which matters when eol is longer than one character (e.g. "\r\n").
    if eol and encoded[-1] not in CRLF and decoded.endswith(eol):
        decoded = decoded[:-len(eol)]
    return decoded


# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode



def _unquote_match(match):
    """Turn a match in the form =AB to the ASCII character with value 0xab"""
    s = match.group(0)
    return unquote(s)


# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    Underscores are turned into spaces and every =XX escape (XX being two hex
    digits) into the corresponding character.  An "=" that is not followed by
    two hex digits is left untouched.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.Header class for that functionality.
    """
    s = s.replace('_', ' ')
    # Match hex digits only; a looser pattern such as \w would hand non-hex
    # pairs to unquote() and raise ValueError.
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s)
# Generated by PyXR 0.9.4