PyXR

c:\python24\lib\email\quopriMIME.py


0001 # Copyright (C) 2001-2004 Python Software Foundation
0002 # Author: Ben Gertzfield
0003 # Contact: email-sig@python.org
0004 
0005 """Quoted-printable content transfer encoding per RFCs 2045-2047.
0006 
0007 This module handles the content transfer encoding method defined in RFC 2045
0008 to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
0009 safely encode text that is in a character set similar to the 7-bit US ASCII
0010 character set, but that includes some 8-bit characters that are normally not
0011 allowed in email bodies or headers.
0012 
0013 Quoted-printable is very space-inefficient for encoding binary files; use the
0014 email.base64MIME module for that instead.
0015 
0016 This module provides an interface to encode and decode both headers and bodies
0017 with quoted-printable encoding.
0018 
0019 RFC 2047 defines a method for including character set information in an
0020 `encoded-word' in a header.  This method is commonly used for 8-bit real names
0021 in To:/From:/Cc: etc. fields, as well as Subject: lines.
0022 
0023 This module does not do the line wrapping or end-of-line character
0024 conversion necessary for proper internationalized headers; it only
0025 does dumb encoding and decoding.  To deal with the various line
0026 wrapping issues, use the email.Header module.
0027 """
0028 
0029 import re
0030 from string import hexdigits
0031 from email.Utils import fix_eols
0032 
# Line-ending strings.  CRLF is the canonical email line separator; NL is the
# default `eol' argument of the encoding and decoding functions below.
CRLF = '\r\n'
NL = '\n'

# See also Charset.py
# Number of characters of RFC chrome that header_encode() puts around each
# encoded chunk in addition to the charset name: "=?" + "?q?" + "?=".
MISC_LEN = 7

# Matches a single character that has to be escaped in a header: anything
# other than ASCII letters, digits, '-', '!', '*', '+', '/' and space.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
# Matches a single character that has to be escaped in a body: anything other
# than space, tab and the ASCII range '!' through '~' with '=' excluded.
bqre = re.compile(r'[^ !-<>-~\t]')
0041 
0042 
0043 
0044 # Helpers
def header_quopri_check(c):
    """Tell whether `c' must be escaped when quopri-encoding a header.

    Returns True if the character falls outside the set that hqre treats as
    safe, False otherwise.
    """
    needs_escape = hqre.match(c) is not None
    return needs_escape
0048 
0049 
def body_quopri_check(c):
    """Tell whether `c' must be escaped when quopri-encoding a body.

    Returns True if the character falls outside the set that bqre treats as
    safe, False otherwise.
    """
    needs_escape = bqre.match(c) is not None
    return needs_escape
0053 
0054 
def header_quopri_len(s):
    """Return the length `s' will have once encoded with header quopri.

    A character that hqre flags for escaping takes three characters ("=XX");
    every other character takes one.
    """
    # One character each, plus two extra for every escaped one.
    return sum(1 + 2 * bool(hqre.match(ch)) for ch in s)
0064 
0065 
def body_quopri_len(str):
    """Return the length `str' will have once encoded with body quopri.

    A character that bqre flags for escaping takes three characters ("=XX");
    every other character takes one.
    """
    # One character each, plus two extra for every escaped one.
    return sum(1 + 2 * bool(bqre.match(ch)) for ch in str)
0075 
0076 
0077 def _max_append(L, s, maxlen, extra=''):
0078     if not L:
0079         L.append(s.lstrip())
0080     elif len(L[-1]) + len(s) <= maxlen:
0081         L[-1] += extra + s
0082     else:
0083         L.append(s.lstrip())
0084 
0085 
def unquote(s):
    """Convert an escape of the form =AB into the character with code 0xAB.

    Only s[1:3] is looked at; it is parsed as a hexadecimal number.
    """
    hex_pair = s[1:3]
    code = int(hex_pair, 16)
    return chr(code)
0089 
0090 
def quote(c):
    """Return the quoted-printable escape for the single character `c'.

    The result is '=' followed by the character's code written as
    zero-padded, uppercase hexadecimal (e.g. '=3D' for '=').
    """
    code = ord(c)
    return "=%02X" % code
0093 
0094 
0095 
def header_encode(header, charset="iso-8859-1", keep_eols=False,
                  maxlinelen=76, eol=NL):
    """Encode a single header line with the quoted-printable-like `Q' encoding.

    The `Q' encoding (see RFC 2047) is similar to quoted-printable, but is
    used specifically for email header fields so that charsets with mostly 7
    bit characters (and some 8 bit) remain more or less readable in mail
    clients that do not understand encoded headers.

    charset names the character set to use to encode the header.  It defaults
    to iso-8859-1.

    The resulting string will be in the form:

    "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?=\\n
      =?charset?q?Silly_=C8nglish_Kn=EEghts?="

    with each line wrapped safely at, at most, maxlinelen characters (defaults
    to 76 characters).  If maxlinelen is None, no practical limit is applied
    (chunks may hold up to 100000 encoded characters).

    Unless keep_eols is True (the default is False), the header is first
    passed through fix_eols() to normalize its end-of-line characters.

    The encoded chunks are joined with the value of eol followed by a single
    space.  eol defaults to "\\n"; set it to "\\r\\n" if you are using the
    result of this function directly in email.

    An empty header is returned unchanged.
    """
    # Empty headers are handed back untouched.
    if not header:
        return header

    if not keep_eols:
        header = fix_eols(header)

    # Room available for encoded text in each chunk once the RFC chrome
    # ("=?charset?q?" ... "?=") has been accounted for.
    if maxlinelen is None:
        # No wrapping requested: an obnoxiously large limit is good enough.
        max_encoded = 100000
    else:
        max_encoded = maxlinelen - len(charset) - MISC_LEN - 1

    chunks = []
    for ch in header:
        if ch == ' ':
            # Space may be represented as _ instead of =20 for readability.
            piece = '_'
        elif hqre.match(ch):
            # Unsafe character: replace with its hex value, like =E2.
            piece = "=%02X" % ord(ch)
        else:
            # Safe character: included verbatim.
            piece = ch
        _max_append(chunks, piece, max_encoded)

    # Wrap every chunk in the RFC chrome and glue the chunks together.
    # BAW: should we be able to specify the leading whitespace in the joiner?
    wrapped = []
    for chunk in chunks:
        wrapped.append('=?%s?q?%s?=' % (charset, chunk))
    return (eol + ' ').join(wrapped)
0157 
0158 
0159 
def encode(body, binary=False, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is False (the default), the body is first passed through
    fix_eols() to normalize its end-of-line characters.  Otherwise they are
    left verbatim.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    Whitespace at the end of a line is protected: on the last line of the
    body it is escaped ("=20" / "=09"); on any other line it is followed by a
    soft line break.

    An empty body is returned unchanged.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # Output fragments are collected in a list and joined once at the end,
    # which avoids building the result by repeated string concatenation.
    chunks = []
    # Preserve line endings here so we can tell later whether an eol needs to
    # be added to the output.
    lines = body.splitlines(1)
    last_lineno = len(lines) - 1
    for lineno, raw_line in enumerate(lines):
        # Remember whether the line had an ending, then strip it off for
        # processing.
        has_eol = raw_line[-1] in CRLF
        if raw_line.endswith(CRLF):
            line = raw_line[:-2]
        elif has_eol:
            line = raw_line[:-1]
        else:
            line = raw_line

        encoded_line = ''
        prev = None
        linelen = len(line)
        # Examine every character to see if it needs to be quopri encoded.
        # encoded_line never grows beyond maxlinelen characters, so plain
        # concatenation is fine for it.
        for j in range(linelen):
            c = line[j]
            prev = c
            if bqre.match(c):
                c = quote(c)
            elif j + 1 == linelen:
                # Last character of the line and it needs no escaping.
                # Whitespace is held back (it is still in prev) for the
                # special handling below; anything else is emitted as is.
                if c not in ' \t':
                    encoded_line += c
                continue
            # Break the line softly if adding c would leave no room for the
            # trailing "=".
            if len(encoded_line) + len(c) >= maxlinelen:
                chunks.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c
        # Now at end of line..
        if prev and prev in ' \t':
            if lineno == last_lineno:
                # Whitespace at end of file: it has to be escaped.
                prev = quote(prev)
                if len(encoded_line) + len(prev) > maxlinelen:
                    chunks.append(encoded_line + '=' + eol + prev)
                else:
                    chunks.append(encoded_line + prev)
            else:
                # Whitespace at end of an ordinary line: keep it literal and
                # follow it with a soft line break.  Break first if the
                # whitespace plus "=" would push the line past maxlinelen.
                if len(encoded_line) + len(prev) >= maxlinelen:
                    chunks.append(encoded_line + '=' + eol)
                    encoded_line = ''
                chunks.append(encoded_line + prev + '=' + eol)
            encoded_line = ''
        # If the line we just finished had a line ending, terminate the
        # encoded line with eol.
        if has_eol:
            chunks.append(encoded_line + eol)
        else:
            chunks.append(encoded_line)
    return ''.join(chunks)
0240 
0241 
# Aliases of encode(), kept for convenience and for backwards compatibility
# with the function names used by the standard base64 module.
body_encode = encode
encodestring = encode
0245 
0246 
0247 
0248 # BAW: I'm not sure if the intent was for the signature of this function to be
0249 # the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines of the decoded text are separated with eol, which defaults to \\n.

    Trailing whitespace on each encoded line is discarded.  A "=" at the end
    of a line is a soft line break and produces no output; "=AB" with two
    hexadecimal digits is replaced by the character with that code; any other
    "=" is passed through literally.

    If the encoded input does not end with a line break, the decoded result
    does not end with eol either.  An empty input is returned unchanged.
    """
    if not encoded:
        return encoded
    # Output fragments are collected in a list and joined once at the end,
    # which avoids building the result by repeated string concatenation.
    pieces = []

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            pieces.append(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                pieces.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # this is a soft line break: emit nothing, not even eol.
            elif i + 1 == n:
                i += 1
                continue
            # Decode if in form =AB
            elif i + 2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
                pieces.append(unquote(line[i:i+3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                pieces.append(c)
                i += 1

            # The line has been consumed without a soft break: end it.
            if i == n:
                pieces.append(eol)

    decoded = ''.join(pieces)
    # Special case if the original string did not end with a line break: drop
    # the eol added after the last line.  The whole eol is removed, which
    # matters when eol is longer than one character (e.g. "\r\n").
    if eol and encoded[-1] not in '\r\n' and decoded.endswith(eol):
        decoded = decoded[:-len(eol)]
    return decoded
0295 
0296 
# Aliases of decode(), kept for convenience and for backwards compatibility
# with the function names used by the standard base64 module.
body_decode = decode
decodestring = decode
0300 
0301 
0302 
def _unquote_match(match):
    """Substitution callback: decode the =AB escape captured by `match'.

    The whole matched text is handed to unquote(), which returns the
    character with value 0xAB.
    """
    return unquote(match.group(0))
0307 
0308 
0309 # Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with the RFC 2047 MIME header `Q' encoding.

    Underscores are turned into spaces and every "=AB" escape, where A and B
    are hexadecimal digits, is replaced by the character with value 0xAB.  An
    "=" that is not followed by two hexadecimal digits is left untouched.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.Header class for that functionality.
    """
    s = s.replace('_', ' ')
    # Match hexadecimal escapes only.  A looser pattern such as "any two word
    # characters" would also pick up text like "=zz", which cannot be
    # converted to a character code and would raise ValueError.
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s)
0319
Generated by PyXR 0.9.4