0001 # Copyright (C) 2001-2004 Python Software Foundation 0002 # Author: Ben Gertzfield, Barry Warsaw 0003 # Contact: email-sig@python.org 0004 0005 import email.base64MIME 0006 import email.quopriMIME 0007 from email.Encoders import encode_7or8bit 0008 0009 0010 0011 # Flags for types of header encodings 0012 QP = 1 # Quoted-Printable 0013 BASE64 = 2 # Base64 0014 SHORTEST = 3 # the shorter of QP and base64, but only for headers 0015 0016 # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7 0017 MISC_LEN = 7 0018 0019 DEFAULT_CHARSET = 'us-ascii' 0020 0021 0022 0023 # Defaults 0024 CHARSETS = { 0025 # input header enc body enc output conv 0026 'iso-8859-1': (QP, QP, None), 0027 'iso-8859-2': (QP, QP, None), 0028 'iso-8859-3': (QP, QP, None), 0029 'iso-8859-4': (QP, QP, None), 0030 # iso-8859-5 is Cyrillic, and not especially used 0031 # iso-8859-6 is Arabic, also not particularly used 0032 # iso-8859-7 is Greek, QP will not make it readable 0033 # iso-8859-8 is Hebrew, QP will not make it readable 0034 'iso-8859-9': (QP, QP, None), 0035 'iso-8859-10': (QP, QP, None), 0036 # iso-8859-11 is Thai, QP will not make it readable 0037 'iso-8859-13': (QP, QP, None), 0038 'iso-8859-14': (QP, QP, None), 0039 'iso-8859-15': (QP, QP, None), 0040 'windows-1252':(QP, QP, None), 0041 'viscii': (QP, QP, None), 0042 'us-ascii': (None, None, None), 0043 'big5': (BASE64, BASE64, None), 0044 'gb2312': (BASE64, BASE64, None), 0045 'euc-jp': (BASE64, None, 'iso-2022-jp'), 0046 'shift_jis': (BASE64, None, 'iso-2022-jp'), 0047 'iso-2022-jp': (BASE64, None, None), 0048 'koi8-r': (BASE64, BASE64, None), 0049 'utf-8': (SHORTEST, BASE64, 'utf-8'), 0050 # We're making this one up to represent raw unencoded 8-bit 0051 '8bit': (None, BASE64, 'utf-8'), 0052 } 0053 0054 # Aliases for other commonly-used names for character sets. Map 0055 # them to the real ones used in email. 
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
    'gb2312':   'eucgb2312_cn',
    'big5':     'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii': None,
    }


# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Add character set properties to the global registry.

    charset is the input character set, and must be the canonical name of a
    character set.

    Optional header_enc and body_enc are each either Charset.QP for
    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    is only valid for header_enc.  They describe how message headers and
    message bodies in the input charset are to be encoded.  Default is no
    encoding.

    Optional output_charset is the character set that the output should be
    in.  Conversions will proceed from input charset, to Unicode, to the
    output charset when the method Charset.convert() is called.  The default
    is to output in the same character set as the input.

    Both charset and output_charset must have Unicode codec entries in the
    module's charset-to-codec mapping; use add_codec(charset, codecname) to
    add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = (header_enc, body_enc, output_charset)


def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1
    """
    ALIASES[alias] = canonical


def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the second argument to the unicode()
    built-in, or to the encode() method of a Unicode string.
    """
    CODEC_MAP[charset] = codecname


class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  unicode() with
        # an encoding argument raises TypeError when handed something that is
        # already unicode, so only byte strings are decoded; unicode input is
        # merely checked to be pure ASCII so both paths raise a UnicodeError
        # on non-ASCII names.
        if isinstance(input_charset, unicode):
            input_charset.encode('ascii')
        else:
            input_charset = unicode(input_charset, 'ascii')
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Look up the default encodings and conversion in the CHARSETS
        # registry.  Charsets that are not registered fall back to
        # (SHORTEST, BASE64, None).
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        # No conversion registered means output stays in the input charset.
        if not conv:
            conv = self.input_charset
        # Set the attributes from the registry defaults.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for a charset, guess and
        # try a Unicode codec with the same name as the charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        # Compare by lower-cased name so a Charset equals its name string.
        return str(self) == str(other).lower()

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec."""
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for multibyte
        characters).

        Returns the string as-is if it isn't known how to convert it to
        Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, unicode) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=True):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from Unicode back
        into an encoded format.  Return the string as-is if it is not Unicode,
        or if it could not be converted from Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is True (the default), uses output_codec to convert to an
        encoded format.  If to_output is False, uses input_codec.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, unicode) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Output codec not installed
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string."""
        cset = self.get_output_charset()
        # The len(s) of a 7bit encoding is len(s)
        if self.header_encoding == BASE64:
            return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64MIME.base64_len(s)
            lenqp = email.quopriMIME.header_quopri_len(s)
            return min(lenb64, lenqp) + len(cset) + MISC_LEN
        else:
            return len(s)

    def header_encode(self, s, convert=False):
        """Header-encode a string, optionally converting it to output_charset.

        If convert is True, the string will be converted from the input
        charset to the output charset automatically.  This is not useful for
        multibyte character sets, which have line length issues (multibyte
        characters must be split on a character, not a byte boundary); use the
        high-level Header class to deal with these issues.  convert defaults
        to False.

        The type of encoding (base64 or quoted-printable) will be based on
        self.header_encoding.
        """
        cset = self.get_output_charset()
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        if self.header_encoding == BASE64:
            return email.base64MIME.header_encode(s, cset)
        elif self.header_encoding == QP:
            return email.quopriMIME.header_encode(s, cset, maxlinelen=None)
        elif self.header_encoding == SHORTEST:
            # Pick whichever encoding yields the shorter result; ties go to
            # quoted-printable.
            lenb64 = email.base64MIME.base64_len(s)
            lenqp = email.quopriMIME.header_quopri_len(s)
            if lenb64 < lenqp:
                return email.base64MIME.header_encode(s, cset)
            else:
                return email.quopriMIME.header_encode(s, cset, maxlinelen=None)
        else:
            return s

    def body_encode(self, s, convert=True):
        """Body-encode a string and convert it to output_charset.

        If convert is True (the default), the string will be converted from
        the input charset to output charset automatically.  Unlike
        header_encode(), there are no issues with byte boundaries and
        multibyte charsets in email bodies, so this is usually pretty safe.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.
        """
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions).
        # The flags are plain integers, so compare by value, not identity.
        if self.body_encoding == BASE64:
            return email.base64MIME.body_encode(s)
        elif self.body_encoding == QP:
            return email.quopriMIME.body_encode(s)
        else:
            return s
# Generated by PyXR 0.9.4