# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
#
# Labels in their ASCII (ACE) form are handled as byte strings and labels in
# their Unicode form as text strings.  The code is written so that it behaves
# the same on Python 2.6+ (where ``bytes`` is ``str``) and on Python 3.

import stringprep, unicodedata, re, codecs

# Nameprep is defined on Unicode 3.2; use the frozen 3.2 database when the
# interpreter provides one, otherwise fall back to the current database.
_ucd = getattr(unicodedata, "ucd_3_2_0", unicodedata)

# IDNA section 3.1: all of these code points act as label separators.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = b"xn--"
uace_prefix = ace_prefix.decode("ascii")

# Text (unicode) string type of the running interpreter.
_text_type = type(u"")


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a text label.

    The label is mapped (tables B.1 and B.2), NFKC-normalized, checked
    against the prohibited-character tables and finally checked against the
    bidirectional requirements.

    Returns the prepared text label.
    Raises UnicodeError for a prohibited character or a BIDI violation.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = _ucd.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests (once per label, not once per RandAL character):
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label


def ToASCII(label):
    """Convert one label to its ASCII (ACE) form, RFC 3490 section 4.1.

    Returns a byte string of 1 to 63 ASCII characters.
    Raises UnicodeError if the label is empty, too long, fails Nameprep,
    or is a non-ASCII label that already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")


def ToUnicode(label):
    """Convert one label to its Unicode form, RFC 3490 section 4.2.

    `label` may be a byte string (taken to be ASCII) or a text string.
    Returns a text string.
    Raises UnicodeError if the label is not ASCII after Nameprep or if the
    decoded label does not convert back to the same ACE form.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.  RFC 3490 section 5 requires the prefix
    # to be recognized case-insensitively.
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return label.decode("ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole, dot-separated domain names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name to its ASCII form.

        Returns ``(encoded byte string, number of input characters consumed)``.
        Only ``errors='strict'`` is supported; anything else raises
        UnicodeError, as does any label rejected by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling " + errors)

        # Without this guard the single empty label produced by the split
        # would be mistaken for a trailing dot and "" would encode to ".".
        if not input:
            return b"", 0

        result = []
        labels = dots.split(input)
        if labels and len(labels[-1]) == 0:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            result.append(ToASCII(label))
        # Join with U+002E
        return b".".join(result) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name to its Unicode form.

        Returns ``(decoded text string, length of the input consumed)``.
        Only ``errors='strict'`` is supported; anything else raises
        UnicodeError, as does non-ASCII byte input or any label rejected by
        ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling " + errors)

        # Same reasoning as in encode(): "" must not decode to ".".
        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, _text_type):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = bytes(input)
            input.decode("ascii")  # validation only; raises on non-ASCII
            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = u'.'
            del labels[-1]
        else:
            trailing_dot = u''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return u".".join(result) + trailing_dot, len(input)


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass


### encodings module API

def getregentry():
    """Return the (encoder, decoder, StreamReader, StreamWriter) tuple."""
    return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
# Generated by PyXR 0.9.4