PyXR

c:\python24\lib \ email \ Charset.py


0001 # Copyright (C) 2001-2004 Python Software Foundation
0002 # Author: Ben Gertzfield, Barry Warsaw
0003 # Contact: email-sig@python.org
0004 
0005 import email.base64MIME
0006 import email.quopriMIME
0007 from email.Encoders import encode_7or8bit
0008 
0009 
0010 
# Flags for types of header encodings.  The same flag values are used in the
# header-encoding and body-encoding columns of the CHARSETS table; None in
# either column means "no encoding".
QP          = 1 # Quoted-Printable
BASE64      = 2 # Base64
SHORTEST    = 3 # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7 characters
# of fixed overhead on top of the charset name and the encoded payload.
MISC_LEN = 7

# Character set used when Charset() is instantiated without an argument.
DEFAULT_CHARSET = 'us-ascii'
0020 
0021 
0022 
# Defaults.  Maps a canonical (lower-case) charset name to a 3-tuple of
# (header encoding, body encoding, output charset).  The encodings are QP,
# BASE64, SHORTEST or None; an output charset of None means the output stays
# in the input charset.  Charsets that are not listed here are treated by
# Charset.__init__ as (SHORTEST, BASE64, None).
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'windows-1252':(QP,        QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
    # We're making this one up to represent raw unencoded 8-bit
    '8bit':        (None,      BASE64, 'utf-8'),
    }
0053 
# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.  Charset.__init__ lower-cases the
# requested name before looking it up here, so keys are written in lower
# case.  The table is also applied to the output charset name.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }
0080 
0081 
# Map charsets to their Unicode codec strings.  A charset without an entry
# here is assumed by Charset.__init__ to have a codec of the same name; a
# value of None tells the conversion helpers to leave the data untouched.
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }
0091 
0092 
0093 
0094 # Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register (or replace) the email properties of a character set.

    charset is the canonical name of the input character set; it becomes the
    key of the entry in the module-level CHARSETS registry.

    header_enc and body_enc say how headers and bodies in that charset are to
    be encoded: Charset.QP for quoted-printable, Charset.BASE64 for base64,
    Charset.SHORTEST for whichever of the two is shorter, or None (the
    default) for no encoding.  SHORTEST is accepted for header_enc only;
    passing it as body_enc raises ValueError and leaves the registry
    unchanged.

    output_charset names the character set the output should be in.  When
    Charset.convert() is called, the text goes from the input charset to
    Unicode and then to the output charset.  The default, None, keeps the
    output in the input charset.

    Both charset and output_charset need Unicode codec entries in the
    module's charset-to-codec mapping; use add_codec(charset, codecname) to
    register codecs the module does not know about.  See the codecs module's
    documentation for more information.
    """
    properties = (header_enc, body_enc, output_charset)
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = properties
0121 
0122 
def add_alias(alias, canonical):
    """Register an alternative name for a character set.

    alias is the alternative name, e.g. latin-1.
    canonical is the canonical name it stands for, e.g. iso-8859-1.

    An existing alias of the same name is overwritten.
    """
    ALIASES.update({alias: canonical})
0130 
0131 
def add_codec(charset, codecname):
    """Register the codec that maps the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, in the form accepted as the second argument to the
    unicode() built-in or by the encode() method of a Unicode string.

    An existing entry for the same charset is overwritten.
    """
    CODEC_MAP.update({charset: codecname})
0140 
0141 
0142 
class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Look up the email properties of input_charset.

        input_charset is a charset name or alias, as a byte string or a
        unicode string; it must contain only ASCII characters, otherwise a
        UnicodeError is raised.  The name is matched case-insensitively.
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  unicode(x,
        # 'ascii') raises TypeError when x is already a unicode object, so
        # only byte strings are decoded; unicode input is merely checked for
        # being pure ASCII.
        if isinstance(input_charset, unicode):
            input_charset.encode('ascii')
        else:
            input_charset = unicode(input_charset, 'ascii')
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # We can try to guess which encoding and conversion to use by the
        # CHARSETS dictionary.  Charsets that are not registered there get
        # the shortest header encoding, base64 bodies and no conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No output conversion registered: output in the input charset.
            conv = self.input_charset
        # Set the attributes from the registry entry.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for a charset, guess and
        # try a Unicode codec with the same name as the charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                            self.output_charset)

    def __str__(self):
        """Return the lower-cased input charset name."""
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        """Compare by name, case-insensitively, against str(other)."""
        return str(self) == str(other).lower()

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec.

        The string is returned unchanged when both codecs are the same.
        """
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for multibyte
        characters).

        Returns the string as-is if it isn't known how to convert it to
        Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, unicode) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=True):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from Unicode back
        into an encoded format.  Return the string as-is if it is not Unicode,
        or if it could not be converted from Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is True (the default), uses output_codec to convert to an
        encoded format.  If to_output is False, uses input_codec.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, unicode) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Output codec not installed
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is set, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string."""
        cset = self.get_output_charset()
        # The len(s) of a 7bit encoding is len(s)
        if self.header_encoding == BASE64:
            return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64MIME.base64_len(s)
            lenqp = email.quopriMIME.header_quopri_len(s)
            return min(lenb64, lenqp) + len(cset) + MISC_LEN
        else:
            return len(s)

    def header_encode(self, s, convert=False):
        """Header-encode a string, optionally converting it to output_charset.

        If convert is True, the string will be converted from the input
        charset to the output charset automatically.  This is not useful for
        multibyte character sets, which have line length issues (multibyte
        characters must be split on a character, not a byte boundary); use the
        high-level Header class to deal with these issues.  convert defaults
        to False.

        The type of encoding (base64 or quoted-printable) will be based on
        self.header_encoding.
        """
        cset = self.get_output_charset()
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        if self.header_encoding == BASE64:
            return email.base64MIME.header_encode(s, cset)
        elif self.header_encoding == QP:
            return email.quopriMIME.header_encode(s, cset, maxlinelen=None)
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64MIME.base64_len(s)
            lenqp = email.quopriMIME.header_quopri_len(s)
            # Ties go to quoted-printable.
            if lenb64 < lenqp:
                return email.base64MIME.header_encode(s, cset)
            else:
                return email.quopriMIME.header_encode(s, cset, maxlinelen=None)
        else:
            return s

    def body_encode(self, s, convert=True):
        """Body-encode a string and convert it to output_charset.

        If convert is True (the default), the string will be converted from
        the input charset to output charset automatically.  Unlike
        header_encode(), there are no issues with byte boundaries and
        multibyte charsets in email bodies, so this is usually pretty safe.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.
        """
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        # The flags are plain integers, so compare by value, not identity.
        if self.body_encoding == BASE64:
            return email.base64MIME.body_encode(s)
        elif self.body_encoding == QP:
            return email.quopriMIME.body_encode(s)
        else:
            return s
0371
Generated by PyXR 0.9.4