# Test suite for the codecs module (Python 2 era).  Exercises:
#   * incremental StreamReader decoding, fed one byte at a time
#   * UTF-16 / UTF-16-LE / UTF-16-BE / UTF-8 stream readers (BOM handling)
#   * escape_decode, EncodedFile recoding
#   * punycode encode/decode against the RFC 3492 sample strings
#   * IDNA nameprep against the libidn test vectors

from test import test_support
import unittest
import codecs
import StringIO

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        # Internal byte buffer: write() appends at the end,
        # read() consumes from the front.
        self._buffer = ""

    def write(self, chars):
        # Append bytes at the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # Pop up to `size` bytes from the head of the queue;
        # a negative size drains the entire buffer.
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class PartialReadTest(unittest.TestCase):
    """Base class providing the byte-by-byte incremental decoding check."""

    def check_partial(self, encoding, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

class UTF16Test(PartialReadTest):
    """Tests for the generic (BOM-prefixed) utf-16 codec."""

    # "spamspam" encoded as UTF-16 with a little-endian / big-endian BOM.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # A StreamWriter must emit the BOM once, not before every write().
        _,_,reader,writer = codecs.lookup("utf-16")
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_partial(self):
        # The first two expected results are empty because the decoder
        # cannot produce output until the 2-byte BOM is complete.
        self.check_partial(
            "utf-16",
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16LETest(PartialReadTest):
    """Tests for the BOM-less little-endian utf-16-le codec."""

    def test_partial(self):
        # No BOM: a character becomes available after every second byte.
        self.check_partial(
            "utf-16-le",
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16BETest(PartialReadTest):
    """Tests for the BOM-less big-endian utf-16-be codec."""

    def test_partial(self):
        # No BOM: a character becomes available after every second byte.
        self.check_partial(
            "utf-16-be",
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF8Test(PartialReadTest):
    """Tests for the variable-length utf-8 codec."""

    def test_partial(self):
        # Input covers 1-, 2- and 3-byte UTF-8 sequences; output only
        # advances once the final byte of each sequence arrives.
        self.check_partial(
            "utf-8",
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        # Decoding the empty string must succeed and consume 0 bytes.
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
# Each entry is (unicode string, expected punycode encoding).
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: flag any malformed (non-pair) test entry.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (UTF-8 encoded input, expected nameprep output).
# A None output means the input contains prohibited characters and
# nameprep must raise; a (None, None) entry marks a skipped vector.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    # Re-raise with the RFC section number for easier triage.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class CodecTest(unittest.TestCase):
    def test_builtin(self):
        # ASCII-only names pass through the idna codec unchanged.
        self.assertEquals(unicode("python.org", "idna"), u"python.org")

class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level codecs.decode/encode/register/lookup API."""

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # UTF-8 stream containing two Hangul syllables split by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

def test_main():
    # Entry point used by the regression-test driver.
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest
    )


if __name__ == "__main__":
    test_main()
Generated by PyXR 0.9.4