# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation

    Split str into basic (ordinal < 128) and extended characters.

    Returns a pair (base, extended): base is the ASCII-encoded
    concatenation of the basic characters in their original order, and
    extended is a sorted list of the distinct non-basic characters.
    """
    base = []
    extended = {}
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            # Dict keys are used as a set: each character is kept once.
            extended[c] = 1
    # Materialize the keys explicitly instead of relying on keys()
    # returning a sortable list.
    extended = list(extended)
    extended.sort()
    return "".join(base).encode("ascii"), extended

def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    res = 0
    for c in str:
        if ord(c) < max:
            res += 1
    return res

def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string.

    The index and pos arguments describe where the previous search
    stopped; scanning resumes at pos+1, so pass -1 for both to start
    from the beginning. Returns (-1, -1) when char does not occur again.
    """

    l = len(str)
    while True:
        pos += 1
        if pos == l:
            return (-1, -1)
        c = str[pos]
        if c == char:
            return index+1, pos
        elif c < char:
            # Characters above char are skipped without being counted.
            index += 1

def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Return the list of integer deltas describing where each character
    of extended (which must be sorted) occurs in str.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        # Account for skipping every code point between the previous
        # extended character and this one.
        delta = (curlen+1) * (char - oldchar)
        while True:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char

    return result

def T(j, bias):
    """Return the threshold for digit position j under the given bias,
    clamped to the range [tmin, tmax]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    if res < 1: return 1
    if res > 26: return 26
    return res

digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Return the list of digit characters encoding the integer N.
    """
    result = []
    j = 0
    while True:
        t = T(j, bias)
        if N < t:
            # A digit below the threshold terminates the number.
            result.append(digits[N])
            return result
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1

def adapt(delta, first, numchars):
    """Return the bias to use for the next delta.

    delta is the delta just processed, first is true when it was the
    first delta of the string, and numchars is the number of code
    points handled so far (used as a divisor, so it must be non-zero).
    """
    if first:
        delta //= 700           # damp
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35 # base - tmin
        divisions += 36
    bias = divisions + (36 * delta // (delta + 38))
    return bias


def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer,
    adapting the bias after each one, and return the concatenated
    digits as a string. baselen is the number of basic code points.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = []
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points==0, baselen+points+1)
    return "".join(result)

def punycode_encode(text):
    """Return the Punycode encoding of text.

    The basic characters come first, followed by "-" and the encoded
    extended characters; the "-" delimiter is omitted when text has no
    basic characters.
    """
    # segregate() already returns the basic part ASCII-encoded, so no
    # second encode step is needed here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one integer from extended, starting at index extpos.
    Only the digits A-Z and 0-9 are accepted, so the caller must
    upper-case the input first.

    Returns a pair (newpos, value). value is None if the input was
    truncated or contained an invalid digit and errors is not "strict";
    in strict mode those conditions raise UnicodeError instead.
    """
    result = 0
    w = 1
    j = 0
    while True:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos was advanced above, so the offending character is
            # the one just before it.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1


def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Insert the characters encoded in extended into the unicode string
    base and return the result. If a delta cannot be decoded (and
    errors is not "strict"), the string built so far is returned.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        # NOTE: unichr is the Python 2 builtin; this module targets
        # Python 2.
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    """Return the unicode string encoded by the Punycode string text.

    Everything before the last "-" is taken as the basic characters;
    the remainder (or all of text if there is no "-") holds the
    encoded extended characters. errors is applied both to the ASCII
    decoding of the basic part and to the decoding of the deltas.
    """
    pos = text.rfind("-")
    if pos == -1:
        base = ""
        extended = text
    else:
        base = text[:pos]
        extended = text[pos+1:]
    # NOTE: unicode is the Python 2 builtin; this module targets
    # Python 2.
    base = unicode(base, "ascii", errors)
    # The digit decoder only recognizes upper-case letters.
    extended = extended.upper()
    return insertion_sort(base, extended, errors)

### Codec APIs

class Codec(codecs.Codec):
    """Stateless Punycode codec."""

    def encode(self,input,errors='strict'):
        """Return (encoded input, number of input characters consumed).

        The errors argument is accepted but not used by the encoder.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self,input,errors='strict'):
        """Return (decoded input, number of input characters consumed).

        Raises UnicodeError if errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    """Return the (encoder, decoder, StreamReader, StreamWriter) tuple
    for this codec."""
    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

# Generated by PyXR 0.9.4