0001 # -*- coding: iso-8859-1 -*- 0002 """ Test script for the Unicode implementation. 0003 0004 Written by Marc-Andre Lemburg (mal@lemburg.com). 0005 0006 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 0007 0008 """#" 0009 import unittest, sys, string, codecs, new 0010 from test import test_support, string_tests 0011 0012 class UnicodeTest( 0013 string_tests.CommonTest, 0014 string_tests.MixinStrUnicodeUserStringTest, 0015 string_tests.MixinStrUnicodeTest, 0016 ): 0017 type2test = unicode 0018 0019 def checkequalnofix(self, result, object, methodname, *args): 0020 method = getattr(object, methodname) 0021 realresult = method(*args) 0022 self.assertEqual(realresult, result) 0023 self.assert_(type(realresult) is type(result)) 0024 0025 # if the original is returned make sure that 0026 # this doesn't happen with subclasses 0027 if realresult is object: 0028 class usub(unicode): 0029 def __repr__(self): 0030 return 'usub(%r)' % unicode.__repr__(self) 0031 object = usub(object) 0032 method = getattr(object, methodname) 0033 realresult = method(*args) 0034 self.assertEqual(realresult, result) 0035 self.assert_(object is not realresult) 0036 0037 def test_literals(self): 0038 self.assertEqual(u'\xff', u'\u00ff') 0039 self.assertEqual(u'\uffff', u'\U0000ffff') 0040 self.assertRaises(UnicodeError, eval, 'u\'\\Ufffffffe\'') 0041 self.assertRaises(UnicodeError, eval, 'u\'\\Uffffffff\'') 0042 self.assertRaises(UnicodeError, eval, 'u\'\\U%08x\'' % 0x110000) 0043 0044 def test_repr(self): 0045 if not sys.platform.startswith('java'): 0046 # Test basic sanity of repr() 0047 self.assertEqual(repr(u'abc'), "u'abc'") 0048 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'") 0049 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'") 0050 self.assertEqual(repr(u'\\c'), "u'\\\\c'") 0051 self.assertEqual(repr(u'\\'), "u'\\\\'") 0052 self.assertEqual(repr(u'\n'), "u'\\n'") 0053 self.assertEqual(repr(u'\r'), "u'\\r'") 0054 self.assertEqual(repr(u'\t'), "u'\\t'") 0055 self.assertEqual(repr(u'\b'), "u'\\x08'") 0056 self.assertEqual(repr(u"'\""), """u'\\'"'""") 0057 self.assertEqual(repr(u"'\""), """u'\\'"'""") 0058 self.assertEqual(repr(u"'"), '''u"'"''') 0059 self.assertEqual(repr(u'"'), """u'"'""") 0060 latin1repr = ( 0061 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 0062 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 0063 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 0064 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 0065 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 0066 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 0067 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" 0068 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" 0069 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" 0070 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" 0071 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" 0072 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" 0073 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" 0074 "\\xfe\\xff'") 0075 testrepr = repr(u''.join(map(unichr, xrange(256)))) 0076 self.assertEqual(testrepr, latin1repr) 0077 0078 def test_count(self): 0079 string_tests.CommonTest.test_count(self) 0080 # check mixed argument types 0081 self.checkequalnofix(3, 'aaa', 'count', u'a') 0082 self.checkequalnofix(0, 'aaa', 'count', u'b') 0083 self.checkequalnofix(3, u'aaa', 'count', 'a') 0084 self.checkequalnofix(0, u'aaa', 'count', 'b') 0085 self.checkequalnofix(0, u'aaa', 'count', 'b') 0086 self.checkequalnofix(1, u'aaa', 'count', 'a', -1) 0087 self.checkequalnofix(3, u'aaa', 'count', 'a', -10) 0088 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1) 0089 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10) 0090 0091 def test_find(self): 0092 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc') 0093 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1) 0094 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4) 0095 0096 self.assertRaises(TypeError, u'hello'.find) 0097 self.assertRaises(TypeError, u'hello'.find, 42) 0098 0099 def test_rfind(self): 0100 string_tests.CommonTest.test_rfind(self) 0101 # check mixed argument types 0102 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc') 0103 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'') 0104 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '') 0105 0106 def test_index(self): 0107 string_tests.CommonTest.test_index(self) 0108 # check mixed argument types 0109 for (t1, t2) in ((str, unicode), (unicode, str)): 0110 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('')) 0111 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def')) 0112 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc')) 0113 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1) 0114 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib')) 0115 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1) 0116 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8) 0117 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1) 0118 0119 def test_rindex(self): 0120 string_tests.CommonTest.test_rindex(self) 0121 # check mixed argument types 0122 for (t1, t2) in ((str, unicode), (unicode, str)): 0123 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2('')) 0124 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def')) 0125 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc')) 0126 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1) 0127 0128 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib')) 0129 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1) 0130 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1) 0131 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8) 0132 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1) 0133 0134 def test_translate(self): 0135 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None}) 0136 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')}) 0137 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}) 0138 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'}) 0139 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''}) 0140 self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'}) 0141 0142 self.assertRaises(TypeError, u'hello'.translate) 0143 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''}) 0144 0145 def test_split(self): 0146 string_tests.CommonTest.test_split(self) 0147 0148 # Mixed arguments 0149 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//') 0150 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//') 0151 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test') 0152 0153 def test_join(self): 0154 string_tests.MixinStrUnicodeUserStringTest.test_join(self) 0155 0156 # mixed arguments 0157 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd']) 0158 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd')) 0159 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz')) 0160 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd']) 0161 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd']) 0162 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd')) 0163 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz')) 0164 0165 def test_strip(self): 0166 string_tests.CommonTest.test_strip(self) 0167 self.assertRaises(UnicodeError, u"hello".strip, "\xff") 0168 0169 def test_replace(self): 0170 string_tests.CommonTest.test_replace(self) 0171 0172 # method call forwarded from str implementation because of unicode argument 0173 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1) 0174 self.assertRaises(TypeError, 'replace'.replace, u"r", 42) 0175 0176 def test_comparison(self): 0177 # Comparisons: 0178 self.assertEqual(u'abc', 'abc') 0179 self.assertEqual('abc', u'abc') 0180 self.assertEqual(u'abc', u'abc') 0181 self.assert_(u'abcd' > 'abc') 0182 self.assert_('abcd' > u'abc') 0183 self.assert_(u'abcd' > u'abc') 0184 self.assert_(u'abc' < 'abcd') 0185 self.assert_('abc' < u'abcd') 0186 self.assert_(u'abc' < u'abcd') 0187 0188 if 0: 0189 # Move these tests to a Unicode collation module test... 0190 # Testing UTF-16 code point order comparisons... 0191 0192 # No surrogates, no fixup required. 0193 self.assert_(u'\u0061' < u'\u20ac') 0194 # Non surrogate below surrogate value, no fixup required 0195 self.assert_(u'\u0061' < u'\ud800\udc02') 0196 0197 # Non surrogate above surrogate value, fixup required 0198 def test_lecmp(s, s2): 0199 self.assert_(s < s2) 0200 0201 def test_fixup(s): 0202 s2 = u'\ud800\udc01' 0203 test_lecmp(s, s2) 0204 s2 = u'\ud900\udc01' 0205 test_lecmp(s, s2) 0206 s2 = u'\uda00\udc01' 0207 test_lecmp(s, s2) 0208 s2 = u'\udb00\udc01' 0209 test_lecmp(s, s2) 0210 s2 = u'\ud800\udd01' 0211 test_lecmp(s, s2) 0212 s2 = u'\ud900\udd01' 0213 test_lecmp(s, s2) 0214 s2 = u'\uda00\udd01' 0215 test_lecmp(s, s2) 0216 s2 = u'\udb00\udd01' 0217 test_lecmp(s, s2) 0218 s2 = u'\ud800\ude01' 0219 test_lecmp(s, s2) 0220 s2 = u'\ud900\ude01' 0221 test_lecmp(s, s2) 0222 s2 = u'\uda00\ude01' 0223 test_lecmp(s, s2) 0224 s2 = u'\udb00\ude01' 0225 test_lecmp(s, s2) 0226 s2 = u'\ud800\udfff' 0227 test_lecmp(s, s2) 0228 s2 = u'\ud900\udfff' 0229 test_lecmp(s, s2) 0230 s2 = u'\uda00\udfff' 0231 test_lecmp(s, s2) 0232 s2 = u'\udb00\udfff' 0233 test_lecmp(s, s2) 0234 0235 test_fixup(u'\ue000') 0236 test_fixup(u'\uff61') 0237 0238 # Surrogates on both sides, no fixup required 0239 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56') 0240 0241 def test_islower(self): 0242 string_tests.MixinStrUnicodeUserStringTest.test_islower(self) 0243 self.checkequalnofix(False, u'\u1FFc', 'islower') 0244 0245 def test_isupper(self): 0246 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self) 0247 if not sys.platform.startswith('java'): 0248 self.checkequalnofix(False, u'\u1FFc', 'isupper') 0249 0250 def test_istitle(self): 0251 string_tests.MixinStrUnicodeUserStringTest.test_title(self) 0252 self.checkequalnofix(True, u'\u1FFc', 'istitle') 0253 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle') 0254 0255 def test_isspace(self): 0256 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self) 0257 self.checkequalnofix(True, u'\u2000', 'isspace') 0258 self.checkequalnofix(True, u'\u200a', 'isspace') 0259 self.checkequalnofix(False, u'\u2014', 'isspace') 0260 0261 def test_isalpha(self): 0262 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self) 0263 self.checkequalnofix(True, u'\u1FFc', 'isalpha') 0264 0265 def test_isdecimal(self): 0266 self.checkequalnofix(False, u'', 'isdecimal') 0267 self.checkequalnofix(False, u'a', 'isdecimal') 0268 self.checkequalnofix(True, u'0', 'isdecimal') 0269 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE 0270 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER 0271 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO 0272 self.checkequalnofix(True, u'0123456789', 'isdecimal') 0273 self.checkequalnofix(False, u'0123456789a', 'isdecimal') 0274 0275 self.checkraises(TypeError, 'abc', 'isdecimal', 42) 0276 0277 def test_isdigit(self): 0278 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self) 0279 self.checkequalnofix(True, u'\u2460', 'isdigit') 0280 self.checkequalnofix(False, u'\xbc', 'isdigit') 0281 self.checkequalnofix(True, u'\u0660', 'isdigit') 0282 0283 def test_isnumeric(self): 0284 self.checkequalnofix(False, u'', 'isnumeric') 0285 self.checkequalnofix(False, u'a', 'isnumeric') 0286 self.checkequalnofix(True, u'0', 'isnumeric') 0287 self.checkequalnofix(True, u'\u2460', 'isnumeric') 0288 self.checkequalnofix(True, u'\xbc', 'isnumeric') 0289 self.checkequalnofix(True, u'\u0660', 'isnumeric') 0290 self.checkequalnofix(True, u'0123456789', 'isnumeric') 0291 self.checkequalnofix(False, u'0123456789a', 'isnumeric') 0292 0293 self.assertRaises(TypeError, u"abc".isnumeric, 42) 0294 0295 def test_contains(self): 0296 # Testing Unicode contains method 0297 self.assert_('a' in u'abdb') 0298 self.assert_('a' in u'bdab') 0299 self.assert_('a' in u'bdaba') 0300 self.assert_('a' in u'bdba') 0301 self.assert_('a' in u'bdba') 0302 self.assert_(u'a' in u'bdba') 0303 self.assert_(u'a' not in u'bdb') 0304 self.assert_(u'a' not in 'bdb') 0305 self.assert_(u'a' in 'bdba') 0306 self.assert_(u'a' in ('a',1,None)) 0307 self.assert_(u'a' in (1,None,'a')) 0308 self.assert_(u'a' in (1,None,u'a')) 0309 self.assert_('a' in ('a',1,None)) 0310 self.assert_('a' in (1,None,'a')) 0311 self.assert_('a' in (1,None,u'a')) 0312 self.assert_('a' not in ('x',1,u'y')) 0313 self.assert_('a' not in ('x',1,None)) 0314 self.assert_(u'abcd' not in u'abcxxxx') 0315 self.assert_(u'ab' in u'abcd') 0316 self.assert_('ab' in u'abc') 0317 self.assert_(u'ab' in 'abc') 0318 self.assert_(u'ab' in (1,None,u'ab')) 0319 self.assert_(u'' in u'abc') 0320 self.assert_('' in u'abc') 0321 0322 # If the following fails either 0323 # the contains operator does not propagate UnicodeErrors or 0324 # someone has changed the default encoding 0325 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2') 0326 0327 self.assert_(u'' in '') 0328 self.assert_('' in u'') 0329 self.assert_(u'' in u'') 0330 self.assert_(u'' in 'abc') 0331 self.assert_('' in u'abc') 0332 self.assert_(u'' in u'abc') 0333 self.assert_(u'\0' not in 'abc') 0334 self.assert_('\0' not in u'abc') 0335 self.assert_(u'\0' not in u'abc') 0336 self.assert_(u'\0' in '\0abc') 0337 self.assert_('\0' in u'\0abc') 0338 self.assert_(u'\0' in u'\0abc') 0339 self.assert_(u'\0' in 'abc\0') 0340 self.assert_('\0' in u'abc\0') 0341 self.assert_(u'\0' in u'abc\0') 0342 self.assert_(u'a' in '\0abc') 0343 self.assert_('a' in u'\0abc') 0344 self.assert_(u'a' in u'\0abc') 0345 self.assert_(u'asdf' in 'asdf') 0346 self.assert_('asdf' in u'asdf') 0347 self.assert_(u'asdf' in u'asdf') 0348 self.assert_(u'asdf' not in 'asd') 0349 self.assert_('asdf' not in u'asd') 0350 self.assert_(u'asdf' not in u'asd') 0351 self.assert_(u'asdf' not in '') 0352 self.assert_('asdf' not in u'') 0353 self.assert_(u'asdf' not in u'') 0354 0355 self.assertRaises(TypeError, u"abc".__contains__) 0356 0357 def test_formatting(self): 0358 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self) 0359 # Testing Unicode formatting strings... 0360 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc') 0361 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00') 0362 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00') 0363 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50') 0364 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57') 0365 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57') 0366 if not sys.platform.startswith('java'): 0367 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'") 0368 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def') 0369 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def') 0370 0371 self.assertEqual(u'%c' % 0x1234, u'\u1234') 0372 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,)) 0373 0374 # formatting jobs delegated from the string implementation: 0375 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...') 0376 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 0377 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...') 0378 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...') 0379 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...') 0380 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...') 0381 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...') 0382 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...') 0383 self.assertEqual('...%s...' % u"abc", u'...abc...') 0384 self.assertEqual('%*s' % (5,u'abc',), u' abc') 0385 self.assertEqual('%*s' % (-5,u'abc',), u'abc ') 0386 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab') 0387 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc') 0388 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc') 0389 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc') 0390 self.assertEqual('%c' % u'a', u'a') 0391 0392 0393 def test_constructor(self): 0394 # unicode(obj) tests (this maps to PyObject_Unicode() at C level) 0395 0396 self.assertEqual( 0397 unicode(u'unicode remains unicode'), 0398 u'unicode remains unicode' 0399 ) 0400 0401 class UnicodeSubclass(unicode): 0402 pass 0403 0404 self.assertEqual( 0405 unicode(UnicodeSubclass('unicode subclass becomes unicode')), 0406 u'unicode subclass becomes unicode' 0407 ) 0408 0409 self.assertEqual( 0410 unicode('strings are converted to unicode'), 0411 u'strings are converted to unicode' 0412 ) 0413 0414 class UnicodeCompat: 0415 def __init__(self, x): 0416 self.x = x 0417 def __unicode__(self): 0418 return self.x 0419 0420 self.assertEqual( 0421 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')), 0422 u'__unicode__ compatible objects are recognized') 0423 0424 class StringCompat: 0425 def __init__(self, x): 0426 self.x = x 0427 def __str__(self): 0428 return self.x 0429 0430 self.assertEqual( 0431 unicode(StringCompat('__str__ compatible objects are recognized')), 0432 u'__str__ compatible objects are recognized' 0433 ) 0434 0435 # unicode(obj) is compatible to str(): 0436 0437 o = StringCompat('unicode(obj) is compatible to str()') 0438 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()') 0439 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 0440 0441 # %-formatting and .__unicode__() 0442 self.assertEqual(u'%s' % 0443 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"), 0444 u"u'%s' % obj uses obj.__unicode__()") 0445 self.assertEqual(u'%s' % 0446 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"), 0447 u"u'%s' % obj falls back to obj.__str__()") 0448 0449 for obj in (123, 123.45, 123L): 0450 self.assertEqual(unicode(obj), unicode(str(obj))) 0451 0452 # unicode(obj, encoding, error) tests (this maps to 0453 # PyUnicode_FromEncodedObject() at C level) 0454 0455 if not sys.platform.startswith('java'): 0456 self.assertRaises( 0457 TypeError, 0458 unicode, 0459 u'decoding unicode is not supported', 0460 'utf-8', 0461 'strict' 0462 ) 0463 0464 self.assertEqual( 0465 unicode('strings are decoded to unicode', 'utf-8', 'strict'), 0466 u'strings are decoded to unicode' 0467 ) 0468 0469 if not sys.platform.startswith('java'): 0470 self.assertEqual( 0471 unicode( 0472 buffer('character buffers are decoded to unicode'), 0473 'utf-8', 0474 'strict' 0475 ), 0476 u'character buffers are decoded to unicode' 0477 ) 0478 0479 self.assertRaises(TypeError, unicode, 42, 42, 42) 0480 0481 def test_codecs_utf7(self): 0482 utfTests = [ 0483 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example 0484 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example 0485 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example 0486 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example 0487 (u'+', '+-'), 0488 (u'+-', '+--'), 0489 (u'+?', '+-?'), 0490 (u'\?', '+AFw?'), 0491 (u'+?', '+-?'), 0492 (ur'\\?', '+AFwAXA?'), 0493 (ur'\\\?', '+AFwAXABc?'), 0494 (ur'++--', '+-+---') 0495 ] 0496 0497 for (x, y) in utfTests: 0498 self.assertEqual(x.encode('utf-7'), y) 0499 0500 # surrogates not supported 0501 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7') 0502 0503 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd') 0504 0505 def test_codecs_utf8(self): 0506 self.assertEqual(u''.encode('utf-8'), '') 0507 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac') 0508 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82') 0509 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96') 0510 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80') 0511 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80') 0512 self.assertEqual( 0513 (u'\ud800\udc02'*1000).encode('utf-8'), 0514 '\xf0\x90\x80\x82'*1000 0515 ) 0516 self.assertEqual( 0517 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' 0518 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' 0519 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c' 0520 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067' 0521 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das' 0522 u' Nunstuck git und'.encode('utf-8'), 0523 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81' 0524 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3' 0525 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe' 0526 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83' 0527 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8' 0528 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81' 0529 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81' 0530 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3' 0531 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf' 0532 '\xe3\x80\x8cWenn ist das Nunstuck git und' 0533 ) 0534 0535 # UTF-8 specific decoding tests 0536 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' ) 0537 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' ) 0538 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' ) 0539 0540 # Other possible utf-8 test cases: 0541 # * strict decoding testing for all of the 0542 # UTF8_ERROR cases in PyUnicode_DecodeUTF8 0543 0544 def test_codecs_idna(self): 0545 # Test whether trailing dot is preserved 0546 self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.") 0547 0548 def test_codecs_errors(self): 0549 # Error handling (encoding) 0550 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii') 0551 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict') 0552 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x") 0553 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x") 0554 0555 # Error handling (decoding) 0556 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii') 0557 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict') 0558 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x") 0559 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x') 0560 0561 # Error handling (unknown character names) 0562 self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx") 0563 0564 # Error handling (truncated escape sequence) 0565 self.assertRaises(UnicodeError, "\\".decode, "unicode-escape") 0566 0567 # Error handling (bad decoder return) 0568 def search_function(encoding): 0569 def decode1(input, errors="strict"): 0570 return 42 # not a tuple 0571 def encode1(input, errors="strict"): 0572 return 42 # not a tuple 0573 def encode2(input, errors="strict"): 0574 return (42, 42) # no unicode 0575 def decode2(input, errors="strict"): 0576 return (42, 42) # no unicode 0577 if encoding=="test.unicode1": 0578 return (encode1, decode1, None, None) 0579 elif encoding=="test.unicode2": 0580 return (encode2, decode2, None, None) 0581 else: 0582 return None 0583 codecs.register(search_function) 0584 self.assertRaises(TypeError, "hello".decode, "test.unicode1") 0585 self.assertRaises(TypeError, unicode, "hello", "test.unicode2") 0586 self.assertRaises(TypeError, u"hello".encode, "test.unicode1") 0587 self.assertRaises(TypeError, u"hello".encode, "test.unicode2") 0588 # executes PyUnicode_Encode() 0589 import imp 0590 self.assertRaises( 0591 ImportError, 0592 imp.find_module, 0593 "non-existing module", 0594 [u"non-existing dir"] 0595 ) 0596 0597 # Error handling (wrong arguments) 0598 self.assertRaises(TypeError, u"hello".encode, 42, 42, 42) 0599 0600 # Error handling (PyUnicode_EncodeDecimal()) 0601 self.assertRaises(UnicodeError, int, u"\u0200") 0602 0603 def test_codecs(self): 0604 # Encoding 0605 self.assertEqual(u'hello'.encode('ascii'), 'hello') 0606 self.assertEqual(u'hello'.encode('utf-7'), 'hello') 0607 self.assertEqual(u'hello'.encode('utf-8'), 'hello') 0608 self.assertEqual(u'hello'.encode('utf8'), 'hello') 0609 self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000') 0610 self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o') 0611 self.assertEqual(u'hello'.encode('latin-1'), 'hello') 0612 0613 # Roundtrip safety for BMP (just the first 1024 chars) 0614 u = u''.join(map(unichr, xrange(1024))) 0615 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 0616 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): 0617 self.assertEqual(unicode(u.encode(encoding),encoding), u) 0618 0619 # Roundtrip safety for BMP (just the first 256 chars) 0620 u = u''.join(map(unichr, xrange(256))) 0621 for encoding in ('latin-1',): 0622 self.assertEqual(unicode(u.encode(encoding),encoding), u) 0623 0624 # Roundtrip safety for BMP (just the first 128 chars) 0625 u = u''.join(map(unichr, xrange(128))) 0626 for encoding in ('ascii',): 0627 self.assertEqual(unicode(u.encode(encoding),encoding), u) 0628 0629 # Roundtrip safety for non-BMP (just a few chars) 0630 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005' 0631 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 0632 #'raw_unicode_escape', 0633 'unicode_escape', 'unicode_internal'): 0634 self.assertEqual(unicode(u.encode(encoding),encoding), u) 0635 0636 # UTF-8 must be roundtrip safe for all UCS-2 code points 0637 # This excludes surrogates: in the full range, there would be 0638 # a surrogate pair (\udbff\udc00), which gets converted back 0639 # to a non-BMP character (\U0010fc00) 0640 u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000))) 0641 for encoding in ('utf-8',): 0642 self.assertEqual(unicode(u.encode(encoding),encoding), u) 0643 0644 def test_codecs_charmap(self): 0645 # 0-127 0646 s = ''.join(map(chr, xrange(128))) 0647 for encoding in ( 0648 'cp037', 'cp1026', 0649 'cp437', 'cp500', 'cp737', 'cp775', 'cp850', 0650 'cp852', 'cp855', 'cp860', 'cp861', 'cp862', 0651 'cp863', 'cp865', 'cp866', 0652 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 0653 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 0654 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1', 0655 'mac_cyrillic', 'mac_latin2', 0656 0657 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 0658 'cp1256', 'cp1257', 'cp1258', 0659 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 0660 0661 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 0662 'cp1006', 'iso8859_8', 0663 0664 ### These have undefined mappings: 0665 #'cp424', 0666 0667 ### These fail the round-trip: 0668 #'cp875' 0669 0670 ): 0671 self.assertEqual(unicode(s, encoding).encode(encoding), s) 0672 0673 # 128-255 0674 s = ''.join(map(chr, xrange(128, 256))) 0675 for encoding in ( 0676 'cp037', 'cp1026', 0677 'cp437', 'cp500', 'cp737', 'cp775', 'cp850', 0678 'cp852', 'cp855', 'cp860', 'cp861', 'cp862', 0679 'cp863', 'cp865', 'cp866', 0680 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 0681 'iso8859_2', 'iso8859_4', 'iso8859_5', 0682 'iso8859_9', 'koi8_r', 'latin_1', 0683 'mac_cyrillic', 'mac_latin2', 0684 0685 ### These have undefined mappings: 0686 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 0687 #'cp1256', 'cp1257', 'cp1258', 0688 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 0689 #'iso8859_3', 'iso8859_6', 'iso8859_7', 0690 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 0691 0692 ### These fail the round-trip: 0693 #'cp1006', 'cp875', 'iso8859_8', 0694 0695 ): 0696 self.assertEqual(unicode(s, encoding).encode(encoding), s) 0697 0698 def test_concatenation(self): 0699 self.assertEqual((u"abc" u"def"), u"abcdef") 0700 self.assertEqual(("abc" u"def"), u"abcdef") 0701 self.assertEqual((u"abc" "def"), u"abcdef") 0702 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi") 0703 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi") 0704 0705 def test_printing(self): 0706 class BitBucket: 0707 def write(self, text): 0708 pass 0709 0710 out = BitBucket() 0711 print >>out, u'abc' 0712 print >>out, u'abc', u'def' 0713 print >>out, u'abc', 'def' 0714 print >>out, 'abc', u'def' 0715 print >>out, u'abc\n' 0716 print >>out, u'abc\n', 0717 print >>out, u'abc\n', 0718 print >>out, u'def\n' 0719 print >>out, u'def\n' 0720 0721 def test_ucs4(self): 0722 if sys.maxunicode == 0xFFFF: 0723 return 0724 x = u'\U00100000' 0725 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") 0726 self.assertEqual(x, y) 0727 0728 def test_main(): 0729 test_support.run_unittest(UnicodeTest) 0730 0731 if __name__ == "__main__": 0732 test_main() 0733
Generated by PyXR 0.9.4