PyXR

c:\python24\lib \ email \ Header.py


0001 # Copyright (C) 2002-2004 Python Software Foundation
0002 # Author: Ben Gertzfield, Barry Warsaw
0003 # Contact: email-sig@python.org
0004 
0005 """Header encoding and decoding functionality."""
0006 
0007 import re
0008 import binascii
0009 
0010 import email.quopriMIME
0011 import email.base64MIME
0012 from email.Errors import HeaderParseError
0013 from email.Charset import Charset
0014 
# Separators used when joining header chunks and output lines.
NL = '\n'
SPACE = ' '
# Unicode counterparts, used when assembling the unicode form of a header.
USPACE = u' '
UEMPTYSTRING = u''
# A hard tab in the continuation whitespace is measured as this many columns.
SPACE8 = SPACE * 8

# Default maximum length of a header line.
MAXLINELEN = 76
0022 
# Shared Charset instances.  Header falls back to USASCII when no charset is
# given, and Header.append() tries USASCII first and UTF8 last when it has to
# pick an output charset for a unicode string.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')
0025 
# Pattern for encoded-word strings of the form =?charset?q?Hello_World?=
# The three named groups capture the charset, the encoding letter and the
# encoded payload.  The whole pattern is matched case-insensitively.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
0036 
# Field name regexp, including the trailing colon but not any separating
# whitespace, according to RFC 2822.  A field name consists of printable
# US-ASCII characters from exclamation mark (octal 041) to tilde (octal 176),
# with the colon itself (octal 072) excluded because it only terminates the
# name.  For use with .match()
fcre = re.compile(r'[\041-\071\073-\176]+:$')
0041 
0042 
0043 
# Helpers
# Alias for the quoted-printable module's line accumulator.  It is called by
# Header._encode_chunks() as _max_append(chunks, s, maxlinelen, extra) to
# collect encoded chunks into output lines.
# NOTE(review): presumably it extends the last line while the result fits in
# maxlinelen and starts a new line otherwise -- confirm in email.quopriMIME.
_max_append = email.quopriMIME._max_append
0046 
0047 
0048 
def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (decoded_string, charset) pairs, one for each
    decoded part of the header.  For parts that were not encoded the charset
    is None; for encoded parts it is the lower-cased name of the character
    set given in the encoded word.

    An email.Errors.HeaderParseError may be raised when certain decoding
    errors occur (e.g. a base64 decoding exception).
    """
    header = str(header)
    # Without any encoded word the header comes back as one plain part.
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # An individual line may still be free of encoded words.
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields the repeating sequence
        # plain, charset, encoding, encoded, plain, ...
        fields = ecre.split(line)
        nfields = len(fields)
        for start in range(0, nfields, 4):
            plain = fields[start].strip()
            if plain:
                # Continue the previous plain part if there is one.
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + plain, None)
                else:
                    decoded.append((plain, None))
            if start + 1 >= nfields:
                # Trailing plain text: no encoded word follows it.
                continue
            charset = fields[start + 1].lower()
            encoding = fields[start + 2].lower()
            encoded = fields[start + 3]
            if encoding == 'q':
                dec = email.quopriMIME.header_decode(encoded)
            elif encoding == 'b':
                try:
                    dec = email.base64MIME.decode(encoded)
                except binascii.Error:
                    # Turn this into a higher level exception.  The lower
                    # level exception is discarded here.
                    raise HeaderParseError
            else:
                dec = None
            # Fall back to the raw text when no decoder produced a value.
            if dec is None:
                dec = encoded

            # Adjacent parts in the same charset are merged into one pair.
            if decoded and decoded[-1][1] == charset:
                decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
            else:
                decoded.append((dec, charset))
    return decoded
0103 
0104 
0105 
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from a sequence of pairs as returned by decode_header()

    decode_header() turns a header value string into a sequence of pairs of
    the form (decoded_string, charset), where charset is the string name of
    the character set.

    This function accepts such a sequence and returns a Header instance.
    Optional maxlinelen, header_name, and continuation_ws have the same
    meaning as in the Header constructor.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # None (meaning us-ascii) and ready-made Charset instances go to
        # append() untouched; anything else is treated as a charset name.
        if not (charset is None or isinstance(charset, Charset)):
            charset = Charset(charset)
        header.append(text, charset)
    return header
0126 
0127 
0128 
class Header:
    """A MIME header value assembled from chunks in one or more charsets."""

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        s, when given, becomes the initial header value.  When it is None the
        header starts out empty and can be filled later through .append()
        calls.  s may be a byte string or a Unicode string; see the .append()
        documentation for how each is treated.

        charset plays two roles: it is handed to .append() together with s,
        and it becomes the default character set for every later .append()
        call that omits the charset argument.  When charset is not given,
        us-ascii is used for both.

        maxlinelen sets the maximum line length explicitly; the default is
        76.  To have the first line split shorter (accounting for the field
        header which isn't included in s, e.g. `Subject') pass the name of
        the field as header_name.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually either a space or a hard tab); it is prepended to
        continuation lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Width of the continuation whitespace with tabs counted as 8 columns.
        ws_columns = len(continuation_ws.replace('\t', SPACE8))
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        # Without a field name the first line is as long as the others.
        # With one, make room for the name plus the colon and the space.
        firstlinelen = maxlinelen
        if header_name is not None:
            firstlinelen = maxlinelen - len(header_name) - 2
        self._firstlinelen = firstlinelen
        # Continuation lines lose the columns taken by their whitespace prefix.
        self._maxlinelen = maxlinelen - ws_columns

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        plain = (None, 'us-ascii')
        pieces = []
        previous = None
        for s, charset in self._chunks:
            # A space is inserted wherever the text switches between a real
            # charset and None/us-ascii, in either direction, so that word
            # boundaries survive.  The first chunk never gets one.
            current = charset
            if pieces:
                if previous in plain:
                    if current not in plain:
                        pieces.append(USPACE)
                elif current in plain:
                    pieces.append(USPACE)
                    current = None
            previous = current
            pieces.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(pieces)

    # Only equality and inequality are defined; there are no ordering
    # operators.
    def __eq__(self, other):
        # other may be a Header or a string.  Encode ourselves to a string
        # and let other perform the comparison against it.
        return other == self.encode()

    def __ne__(self, other):
        return not (self == other)

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        charset, when given, should be a Charset instance or the name of a
        character set (which is converted to a Charset instance).  The
        default of None selects the charset given in the constructor.

        s may be a byte string or a Unicode string.  For a byte string (i.e.
        isinstance(s, str) is true) charset names the encoding of that byte
        string, and a UnicodeError is raised if the string cannot be decoded
        with that charset.  For a Unicode string charset is only a hint: the
        string is encoded with the first of us-ascii, the charset hint and
        utf-8 that does not provoke a UnicodeError, and that charset is
        recorded with the chunk.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # The faux 8bit charset means: store the string exactly as given.
        if charset != '8bit':
            if isinstance(s, str):
                # Check the round trip only.  Decoding with the input codec
                # and re-encoding with the output codec (which may differ)
                # may each raise UnicodeError.  The original byte string is
                # what gets stored.
                as_unicode = unicode(s, charset.input_codec or 'us-ascii',
                                     errors)
                as_unicode.encode(charset.output_codec or 'us-ascii', errors)
            elif isinstance(s, unicode):
                # Store a byte string, encoded with the first output codec
                # that accepts the text.  charset is rebound to the winner.
                for charset in (USASCII, charset, UTF8):
                    try:
                        codec = charset.output_codec or 'us-ascii'
                        s = s.encode(codec, errors)
                    except UnicodeError:
                        continue
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Break s into (string, charset) pairs suitable for _encode_chunks.
        # maxlinelen limits the first piece; the remainder is split against
        # self._maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # Nothing to do when the encoded text already fits.
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # Raw 8bit bytes of unknown encoding might be multibyte data, and
        # cutting between the bytes of one character would corrupt it.  The
        # least harmful choice is not to split at all, even though the line
        # can then exceed maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # RFC 2822 section 2.2.3 recommends folding only at higher-level
        # syntactic breaks.  This is attempted for us-ascii only, although
        # other charsets might benefit from it as well.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        if elen == len(s):
            # Encoding does not change the size of the string, so the cut
            # can be made directly at maxlinelen.
            head = charset.from_splittable(splittable[:maxlinelen], False)
            tail = charset.from_splittable(splittable[maxlinelen:], False)
        else:
            # Otherwise search for the longest prefix that still fits.
            head, tail = _binsplit(splittable, charset, maxlinelen)
        # head has the proper length and only needs converting to its output
        # form; tail is split again recursively.
        head_encoded = charset.from_splittable(charset.to_splittable(head),
                                               True)
        pieces = [(head_encoded, charset)]
        return pieces + self._split(tail, charset, self._maxlinelen,
                                    splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Split s with the module-level helper and pair each resulting line
        # with charset.
        pieces = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return [(piece, charset) for piece in pieces]

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # newchunks is a list of (string, charset) pairs.  Empty strings are
        # skipped.  A string whose charset is None, or whose charset has no
        # header encoding, is used as is; any other string is passed through
        # its charset's header_encode().  The pieces are collected into lines
        # with _max_append() and the lines are joined with a newline followed
        # by the continuation whitespace, giving output such as:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        lines = []
        for header, charset in newchunks:
            if not header:
                continue
            if charset is not None and charset.header_encoding is not None:
                text = charset.header_encode(header)
            else:
                text = header
            # Don't add more folding whitespace than necessary
            extra = ' '
            if lines and lines[-1].endswith(' '):
                extra = ''
            _max_append(lines, text, maxlinelen, extra)
        return (NL + self._continuation_ws).join(lines)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        Header strings may only contain a subset of 7-bit ASCII, so text in
        other character sets has to be converted and encoded (with Base64 or
        quoted-printable).  There is also a 75-character length limit on any
        given encoded header field, which makes line wrapping necessary,
        even with double-byte character sets.

        This method encodes every chunk with the scheme of its character set
        and wraps the result into lines.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        pieces = []
        maxlinelen = self._firstlinelen
        used = 0
        for s, charset in self._chunks:
            # The start of the next chunk should just fill up the current
            # line, leaving one column for the space between encoded words.
            room = maxlinelen - used - 1
            if room < charset.encoded_header_len(''):
                # Not even an empty encoded word fits: use a fresh line.
                room = maxlinelen
            pieces.extend(self._split(s, charset, room, splitchars))
            lastchunk, lastcharset = pieces[-1]
            used = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(pieces, maxlinelen)
0396 
0397 
0398 
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Split an ASCII header value into lines at syntactic break characters.

    s is the header text, which may already contain several lines.  firstlen
    is the length limit for the first output line and restlen the limit for
    all following ones.  continuation_ws is the whitespace that gets
    prepended to continuation lines elsewhere; only its width (a tab counts
    as 8 columns) is used here.  splitchars is a string of candidate break
    characters, tried in order: the first one that occurs in a too-long line
    is the one that line is split on.

    Returns the list of resulting lines.  A line containing none of the
    split characters is returned unsplit, even if it is too long.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since the caller adds its own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  There is no real knowledge of field syntax here; the
        # split characters are simply tried in the order given (by default
        # semi-colons, then commas, then whitespace).
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped so that regex metacharacters such as '.', '+'
        # or '(' are treated literally.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        # A non-whitespace split character is part of the content and is put
        # back after each part; a whitespace one is replaced by the joiner.
        if ch.isspace():
            eol = ''
        else:
            eol = ch
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                # A new line starts here, so count its continuation prefix.
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
0466 
0467 
0468 
def _binsplit(splittable, charset, maxlinelen):
    """Cut splittable at the longest prefix whose encoded form fits.

    A binary search finds the largest k for which the header-encoded length
    of splittable[:k] does not exceed maxlinelen; the empty prefix is assumed
    to fit.  Returns the pair (first, last) holding splittable[:k] and
    splittable[k:], each converted with charset.from_splittable(..., False).
    """
    lo = 0
    hi = len(splittable)
    # Loop invariants: every prefix up to length lo fits, and no prefix
    # longer than hi does.  Lengths in lo+1..hi are still undecided.
    while lo < hi:
        # Round up so that lo < mid <= hi and the range always shrinks.
        mid = (lo + hi + 1) // 2
        candidate = charset.from_splittable(splittable[:mid], True)
        if charset.encoded_header_len(candidate) <= maxlinelen:
            # mid fits, so it becomes the new lower bound.
            lo = mid
        else:
            # mid is too long, so the answer lies strictly below it.
            hi = mid - 1
    # Now lo == hi, which is the longest prefix length that fits.
    first = charset.from_splittable(splittable[:lo], False)
    last = charset.from_splittable(splittable[lo:], False)
    return first, last
0496
Generated by PyXR 0.9.4