PyXR

c:\python24\lib \ encodings \ idna.py


0001 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
0002 
0003 import stringprep, unicodedata, re, codecs
0004 
# IDNA section 3.1: every character that must be recognized as a label
# separator -- U+002E (full stop), U+3002 (ideographic full stop),
# U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix that marks a Punycode-encoded label.
# It is kept both as a byte string (for encoded labels) and as a Unicode
# string (for labels that have not been encoded yet).
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
0011 
0012 # This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile of stringprep (RFC 3491) to one label.

    The label is treated as a query string, so unassigned code points
    are accepted (AllowUnassigned is true).

    label -- Unicode string holding a single domain-name label.

    Returns the prepared Unicode label.

    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidirectional-text requirements.
    """
    # Map: drop the characters of table B.1 ("map to nothing") and apply
    # the case-folding table B.2 to everything else.
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi.  The flags are kept in a real list because the first and
    # last entries are inspected below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    for flag in RandAL:
        if flag:
            break
    else:
        # No RandALCat character at all: nothing further to verify.
        return label

    # There is a RandAL char in the string. The remaining tests look at the
    # whole label, so they are performed once rather than once per RandAL
    # character:
    # 1) The characters in section 5.8 MUST be prohibited.
    # This is table C.8, which was already checked
    # 2) If a string contains any RandALCat character, the string
    # MUST NOT contain any LCat character.
    for c in label:
        if stringprep.in_table_d2(c):
            raise UnicodeError("Violation of BIDI requirement 2")

    # 3) If a string contains any RandALCat character, a
    # RandALCat character MUST be the first character of the
    # string, and a RandALCat character MUST be the last
    # character of the string.
    if not RandAL[0] or not RandAL[-1]:
        raise UnicodeError("Violation of BIDI requirement 3")

    return label
0060 
def ToASCII(label):
    """Convert a single label to its ASCII form (IDNA ToASCII operation).

    label -- one domain-name label, without any separator dots.

    A label that is already pure ASCII is returned as-is (encoded to a
    byte string).  Any other label is run through nameprep() and, if it
    is still not ASCII, Punycode-encoded and given the ACE prefix.

    Returns the ASCII byte-string form of the label.

    Raises UnicodeError if the resulting label is empty or longer than
    63 characters, if nameprep() rejects it, or if a non-ASCII label
    already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        # The range test above rejects the empty label as well, so the
        # message has to cover both failures.
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")
0103 
def ToUnicode(label):
    """Convert a single label to its Unicode form (IDNA ToUnicode operation).

    label -- one domain-name label, either a byte string or a Unicode
             string.

    Returns the label as a Unicode string: the Punycode-decoded text when
    the label carries the ACE prefix, otherwise the ASCII label itself.

    Raises UnicodeError if the label is still not ASCII after nameprep(),
    or if re-encoding the decoded result with ToASCII() does not give
    back the original label.
    """
    # Step 1: Check for ASCII.  Byte strings are taken to be ASCII already;
    # Unicode strings are converted when possible.
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        prepared = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = prepared.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if label.startswith(ace_prefix):
        # Step 4: Remove ACE prefix
        encoded = label[len(ace_prefix):]

        # Step 5: Decode using PUNYCODE
        result = encoded.decode("punycode")

        # Step 6: Apply ToASCII
        round_trip = ToASCII(result)

        # Step 7: Compare the result of step 6 with the one of step 3
        # round_trip will already be in lower case.
        if label.lower() != round_trip:
            raise UnicodeError("IDNA does not round-trip", label, round_trip)

        # Step 8: return the result of step 5
        return result

    # No ACE prefix: the label is plain ASCII.
    return unicode(label, "ascii")
0142 
0143 ### Codec APIs
0144 
class Codec(codecs.Codec):
    """Stateless codec converting whole domain names with IDNA.

    A name is split into labels at the separator characters, each label
    is converted on its own, and the results are joined with U+002E.
    A single trailing separator (root label) is preserved.
    """

    def encode(self,input,errors='strict'):
        """Encode a Unicode domain name to its ASCII (ACE) form.

        input  -- the domain name to encode.
        errors -- must be 'strict'; anything else raises UnicodeError.

        Returns a tuple (encoded name, number of input characters used).
        Raises UnicodeError if any label cannot be converted by ToASCII().
        """

        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            # Splitting an empty name yields one empty label, which would
            # be mistaken for a trailing dot and encoded as ".".
            return "", 0

        result = []
        labels = dots.split(input)
        if labels and len(labels[-1])==0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''
        for label in labels:
            result.append(ToASCII(label))
        # Join with U+002E
        return ".".join(result)+trailing_dot, len(input)

    def decode(self,input,errors='strict'):
        """Decode an ASCII (ACE) domain name to its Unicode form.

        input  -- the domain name to decode; a Unicode string or an
                  object convertible to an ASCII byte string.
        errors -- must be 'strict'; anything else raises UnicodeError.

        Returns a tuple (decoded Unicode name, length of the input).
        Raises UnicodeError if the input is not ASCII or a label cannot
        be converted by ToUnicode().
        """

        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            # Same reasoning as in encode(): an empty name must not be
            # turned into a lone dot.
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Conversion is done only for validation: it raises
            # UnicodeError when the input contains non-ASCII bytes.
            unicode(input, "ascii")
            labels = input.split(".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = u'.'
            del labels[-1]
        else:
            trailing_dot = u''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return u".".join(result)+trailing_dot, len(input)
0189 
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer for IDNA.

    Adds nothing of its own: it combines this module's Codec with
    codecs.StreamWriter.
    """
    pass
0192 
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader for IDNA.

    Adds nothing of its own: it combines this module's Codec with
    codecs.StreamReader.
    """
    pass
0195 
0196 ### encodings module API
0197 
def getregentry():
    """Return the codec entry for the encodings package.

    The entry is the tuple (encoder, decoder, stream reader class,
    stream writer class).
    """
    # The encoder and the decoder are bound to separate Codec instances.
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)
0201
Generated by PyXR 0.9.4