PyXR

c:\python24\lib \ encodings \ punycode.py



0001 # -*- coding: iso-8859-1 -*-
0002 """ Codec for the Punycode encoding, as specified in RFC 3492
0003 
0004 Written by Martin v. Löwis.
0005 """
0006 
0007 import codecs
0008 
0009 ##################### Encoding #####################################
0010 
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its basic (ordinal < 128) and extended code points.

    Returns a pair ``(base, extended)``: *base* is the basic characters,
    in their original order, encoded as ASCII; *extended* is a sorted
    list of the distinct non-basic characters found in *str*.
    """
    base = []
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            # A set keeps each extended character exactly once.
            extended.add(c)
    # sorted() always yields a new list, independent of what kind of
    # object the collection of distinct characters happens to be.
    return "".join(base).encode("ascii"), sorted(extended)
0023 
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
0031 
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    Scanning starts at pos + 1.  Every character smaller than char that
    is passed over increments index.  On a match the pair
    (index + 1, pos) is returned, where pos is the position of the match
    in the full string; when the end of the string is reached without a
    match, (-1, -1) is returned."""

    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            index += 1
        pos += 1
    return (-1, -1)
0049 
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every character of extended (processed in the given order) and
    every occurrence of it in str, compute the delta between this
    insertion and the previous one.  Returns the list of deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Skipping from the previous code point to this one costs one
        # full pass over the (shorter) string per intermediate value.
        delta = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            delta = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
0071 
def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(26, max(1, 36 * (j + 1) - bias))
0078 
# Digit alphabet: values 0-25 map to 'a'-'z', values 26-35 to '0'-'9'.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a list of digit characters, using the
    position-dependent thresholds given by T(j, bias).
    """
    encoded = []
    j = 0
    t = T(j, bias)
    while N >= t:
        # Emit one non-final digit and carry the quotient forward.
        N, remainder = divmod(N - t, 36 - t)
        encoded.append(digits[t + remainder])
        j += 1
        t = T(j, bias)
    # A digit below the threshold terminates the number.
    encoded.append(digits[N])
    return encoded
0092 
def adapt(delta, first, numchars):
    """Compute the next bias from the delta that was just processed.

    first selects the damping divisor (700 for the first delta, 2
    afterwards); numchars is the divisor for the proportional increase
    that is added to the damped delta.
    """
    damp = 2
    if first:
        damp = 700
    scaled = delta // damp
    scaled += scaled // numchars
    # ((base - tmin) * tmax) // 2 == 455
    bias = 0
    while scaled > 455:
        scaled //= 35 # base - tmin
        bias += 36
    return bias + (36 * scaled // (scaled + 38))
0106 
0107 
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode each delta as a generalized variable-length integer, adapting
    the bias after every delta, and return the concatenated digits.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    points = 0
    for delta in deltas:
        pieces.extend(generate_generalized_integer(delta, bias))
        bias = adapt(delta, points == 0, baselen + points + 1)
        points += 1
    return "".join(pieces)
0118 
def punycode_encode(text):
    """Encode *text* as a Punycode string.

    The result is the basic (ASCII) characters of *text*, followed by a
    "-" delimiter and the encoded insertion deltas for the non-basic
    characters.  When *text* contains no basic characters the delimiter
    is omitted and only the encoded deltas are returned.
    """
    # segregate() already returns the basic code points ASCII-encoded,
    # so they are used as-is rather than being encoded a second time.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
0127 
0128 ##################### Decoding #####################################
0129 
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one variable-length integer from *extended*, starting at
    index *extpos*.

    Digits 'A'-'Z' have the values 0-25 and '0'-'9' the values 26-35;
    lower-case letters are not recognised here, so the input is expected
    to be upper-cased already.

    Returns a pair ``(newpos, value)`` where *newpos* is the index just
    past the consumed digits.  If the input ends prematurely or contains
    an invalid digit, UnicodeError is raised when *errors* is "strict";
    otherwise ``(newpos, None)`` is returned.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending
            # character, so it lives at extpos - 1.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold is the last one of the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
0158 
0159 
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in *extended* one at a time and insert the
    resulting characters into *base*, returning the completed string.

    If a delta cannot be decoded (only possible when *errors* is not
    "strict"), the string built so far is returned.  A code point above
    U+10FFFF raises UnicodeError when *errors* is "strict"; otherwise
    '?' is inserted in its place.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Each full pass over the len(base) + 1 insertion slots moves on
        # to the next code point; the remainder is the slot itself.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
0184 
def punycode_decode(text, errors):
    """Decode the Punycode string text.

    Everything before the last "-" is taken as the basic characters and
    everything after it as the encoded deltas; without a "-" the whole
    input is treated as encoded deltas.
    """
    sep = text.rfind("-")
    if sep >= 0:
        base, extended = text[:sep], text[sep+1:]
    else:
        base, extended = "", text
    # The digit decoder only recognises upper-case letters.
    return insertion_sort(unicode(base, "ascii", errors),
                          extended.upper(), errors)
0196 
0197 ### Codec APIs
0198 
class Codec(codecs.Codec):
    """Codec object wrapping punycode_encode and punycode_decode."""

    def encode(self,input,errors='strict'):
        """Encode input to Punycode.

        Returns a pair (encoded string, len(input)).  The errors
        argument is accepted but not consulted by the encoder.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self,input,errors='strict'):
        """Decode the Punycode string input.

        Returns a pair (decoded string, len(input)).  Raises
        UnicodeError if errors is not one of 'strict', 'replace' or
        'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)
0211 
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter."""
0214 
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader."""
0217 
0218 ### encodings module API
0219 
def getregentry():
    """Return the tuple (encoder, decoder, StreamReader, StreamWriter).

    The encoder and decoder are bound methods of two separate Codec
    instances.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)
0223 

Generated by PyXR 0.9.4
SourceForge.net Logo