PyXR

c:\python24\lib\test\test_unicode.py



0001 # -*- coding: iso-8859-1 -*-
0002 """ Test script for the Unicode implementation.
0003 
0004 Written by Marc-Andre Lemburg (mal@lemburg.com).
0005 
0006 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
0007 
0008 """#"
0009 import unittest, sys, string, codecs, new
0010 from test import test_support, string_tests
0011 
0012 class UnicodeTest(
0013     string_tests.CommonTest,
0014     string_tests.MixinStrUnicodeUserStringTest,
0015     string_tests.MixinStrUnicodeTest,
0016     ):
0017     type2test = unicode
0018 
0019     def checkequalnofix(self, result, object, methodname, *args):
0020         method = getattr(object, methodname)
0021         realresult = method(*args)
0022         self.assertEqual(realresult, result)
0023         self.assert_(type(realresult) is type(result))
0024 
0025         # if the original is returned make sure that
0026         # this doesn't happen with subclasses
0027         if realresult is object:
0028             class usub(unicode):
0029                 def __repr__(self):
0030                     return 'usub(%r)' % unicode.__repr__(self)
0031             object = usub(object)
0032             method = getattr(object, methodname)
0033             realresult = method(*args)
0034             self.assertEqual(realresult, result)
0035             self.assert_(object is not realresult)
0036 
0037     def test_literals(self):
0038         self.assertEqual(u'\xff', u'\u00ff')
0039         self.assertEqual(u'\uffff', u'\U0000ffff')
0040         self.assertRaises(UnicodeError, eval, 'u\'\\Ufffffffe\'')
0041         self.assertRaises(UnicodeError, eval, 'u\'\\Uffffffff\'')
0042         self.assertRaises(UnicodeError, eval, 'u\'\\U%08x\'' % 0x110000)
0043 
0044     def test_repr(self):
0045         if not sys.platform.startswith('java'):
0046             # Test basic sanity of repr()
0047             self.assertEqual(repr(u'abc'), "u'abc'")
0048             self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
0049             self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
0050             self.assertEqual(repr(u'\\c'), "u'\\\\c'")
0051             self.assertEqual(repr(u'\\'), "u'\\\\'")
0052             self.assertEqual(repr(u'\n'), "u'\\n'")
0053             self.assertEqual(repr(u'\r'), "u'\\r'")
0054             self.assertEqual(repr(u'\t'), "u'\\t'")
0055             self.assertEqual(repr(u'\b'), "u'\\x08'")
0056             self.assertEqual(repr(u"'\""), """u'\\'"'""")
0057             self.assertEqual(repr(u"'\""), """u'\\'"'""")
0058             self.assertEqual(repr(u"'"), '''u"'"''')
0059             self.assertEqual(repr(u'"'), """u'"'""")
0060             latin1repr = (
0061                 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
0062                 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
0063                 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
0064                 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
0065                 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
0066                 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
0067                 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
0068                 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
0069                 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
0070                 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
0071                 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
0072                 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
0073                 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
0074                 "\\xfe\\xff'")
0075             testrepr = repr(u''.join(map(unichr, xrange(256))))
0076             self.assertEqual(testrepr, latin1repr)
0077 
0078     def test_count(self):
0079         string_tests.CommonTest.test_count(self)
0080         # check mixed argument types
0081         self.checkequalnofix(3,  'aaa', 'count', u'a')
0082         self.checkequalnofix(0,  'aaa', 'count', u'b')
0083         self.checkequalnofix(3, u'aaa', 'count',  'a')
0084         self.checkequalnofix(0, u'aaa', 'count',  'b')
0085         self.checkequalnofix(0, u'aaa', 'count',  'b')
0086         self.checkequalnofix(1, u'aaa', 'count',  'a', -1)
0087         self.checkequalnofix(3, u'aaa', 'count',  'a', -10)
0088         self.checkequalnofix(2, u'aaa', 'count',  'a', 0, -1)
0089         self.checkequalnofix(0, u'aaa', 'count',  'a', 0, -10)
0090 
0091     def test_find(self):
0092         self.checkequalnofix(0,  u'abcdefghiabc', 'find', u'abc')
0093         self.checkequalnofix(9,  u'abcdefghiabc', 'find', u'abc', 1)
0094         self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
0095 
0096         self.assertRaises(TypeError, u'hello'.find)
0097         self.assertRaises(TypeError, u'hello'.find, 42)
0098 
0099     def test_rfind(self):
0100         string_tests.CommonTest.test_rfind(self)
0101         # check mixed argument types
0102         self.checkequalnofix(9,   'abcdefghiabc', 'rfind', u'abc')
0103         self.checkequalnofix(12,  'abcdefghiabc', 'rfind', u'')
0104         self.checkequalnofix(12, u'abcdefghiabc', 'rfind',  '')
0105 
0106     def test_index(self):
0107         string_tests.CommonTest.test_index(self)
0108         # check mixed argument types
0109         for (t1, t2) in ((str, unicode), (unicode, str)):
0110             self.checkequalnofix(0, t1('abcdefghiabc'), 'index',  t2(''))
0111             self.checkequalnofix(3, t1('abcdefghiabc'), 'index',  t2('def'))
0112             self.checkequalnofix(0, t1('abcdefghiabc'), 'index',  t2('abc'))
0113             self.checkequalnofix(9, t1('abcdefghiabc'), 'index',  t2('abc'), 1)
0114             self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
0115             self.assertRaises(ValueError, t1('abcdefghiab').index,  t2('abc'), 1)
0116             self.assertRaises(ValueError, t1('abcdefghi').index,  t2('ghi'), 8)
0117             self.assertRaises(ValueError, t1('abcdefghi').index,  t2('ghi'), -1)
0118 
0119     def test_rindex(self):
0120         string_tests.CommonTest.test_rindex(self)
0121         # check mixed argument types
0122         for (t1, t2) in ((str, unicode), (unicode, str)):
0123             self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex',  t2(''))
0124             self.checkequalnofix(3,  t1('abcdefghiabc'), 'rindex',  t2('def'))
0125             self.checkequalnofix(9,  t1('abcdefghiabc'), 'rindex',  t2('abc'))
0126             self.checkequalnofix(0,  t1('abcdefghiabc'), 'rindex',  t2('abc'), 0, -1)
0127 
0128             self.assertRaises(ValueError, t1('abcdefghiabc').rindex,  t2('hib'))
0129             self.assertRaises(ValueError, t1('defghiabc').rindex,  t2('def'), 1)
0130             self.assertRaises(ValueError, t1('defghiabc').rindex,  t2('abc'), 0, -1)
0131             self.assertRaises(ValueError, t1('abcdefghi').rindex,  t2('ghi'), 0, 8)
0132             self.assertRaises(ValueError, t1('abcdefghi').rindex,  t2('ghi'), 0, -1)
0133 
0134     def test_translate(self):
0135         self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
0136         self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
0137         self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
0138         self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
0139         self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
0140         self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
0141 
0142         self.assertRaises(TypeError, u'hello'.translate)
0143         self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
0144 
0145     def test_split(self):
0146         string_tests.CommonTest.test_split(self)
0147 
0148         # Mixed arguments
0149         self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
0150         self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
0151         self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
0152 
0153     def test_join(self):
0154         string_tests.MixinStrUnicodeUserStringTest.test_join(self)
0155 
0156         # mixed arguments
0157         self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
0158         self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
0159         self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
0160         self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
0161         self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
0162         self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
0163         self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
0164 
0165     def test_strip(self):
0166         string_tests.CommonTest.test_strip(self)
0167         self.assertRaises(UnicodeError, u"hello".strip, "\xff")
0168 
0169     def test_replace(self):
0170         string_tests.CommonTest.test_replace(self)
0171 
0172         # method call forwarded from str implementation because of unicode argument
0173         self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
0174         self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
0175 
0176     def test_comparison(self):
0177         # Comparisons:
0178         self.assertEqual(u'abc', 'abc')
0179         self.assertEqual('abc', u'abc')
0180         self.assertEqual(u'abc', u'abc')
0181         self.assert_(u'abcd' > 'abc')
0182         self.assert_('abcd' > u'abc')
0183         self.assert_(u'abcd' > u'abc')
0184         self.assert_(u'abc' < 'abcd')
0185         self.assert_('abc' < u'abcd')
0186         self.assert_(u'abc' < u'abcd')
0187 
0188         if 0:
0189             # Move these tests to a Unicode collation module test...
0190             # Testing UTF-16 code point order comparisons...
0191 
0192             # No surrogates, no fixup required.
0193             self.assert_(u'\u0061' < u'\u20ac')
0194             # Non surrogate below surrogate value, no fixup required
0195             self.assert_(u'\u0061' < u'\ud800\udc02')
0196 
0197             # Non surrogate above surrogate value, fixup required
0198             def test_lecmp(s, s2):
0199                 self.assert_(s < s2)
0200 
0201             def test_fixup(s):
0202                 s2 = u'\ud800\udc01'
0203                 test_lecmp(s, s2)
0204                 s2 = u'\ud900\udc01'
0205                 test_lecmp(s, s2)
0206                 s2 = u'\uda00\udc01'
0207                 test_lecmp(s, s2)
0208                 s2 = u'\udb00\udc01'
0209                 test_lecmp(s, s2)
0210                 s2 = u'\ud800\udd01'
0211                 test_lecmp(s, s2)
0212                 s2 = u'\ud900\udd01'
0213                 test_lecmp(s, s2)
0214                 s2 = u'\uda00\udd01'
0215                 test_lecmp(s, s2)
0216                 s2 = u'\udb00\udd01'
0217                 test_lecmp(s, s2)
0218                 s2 = u'\ud800\ude01'
0219                 test_lecmp(s, s2)
0220                 s2 = u'\ud900\ude01'
0221                 test_lecmp(s, s2)
0222                 s2 = u'\uda00\ude01'
0223                 test_lecmp(s, s2)
0224                 s2 = u'\udb00\ude01'
0225                 test_lecmp(s, s2)
0226                 s2 = u'\ud800\udfff'
0227                 test_lecmp(s, s2)
0228                 s2 = u'\ud900\udfff'
0229                 test_lecmp(s, s2)
0230                 s2 = u'\uda00\udfff'
0231                 test_lecmp(s, s2)
0232                 s2 = u'\udb00\udfff'
0233                 test_lecmp(s, s2)
0234 
0235                 test_fixup(u'\ue000')
0236                 test_fixup(u'\uff61')
0237 
0238         # Surrogates on both sides, no fixup required
0239         self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
0240 
0241     def test_islower(self):
0242         string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
0243         self.checkequalnofix(False, u'\u1FFc', 'islower')
0244 
0245     def test_isupper(self):
0246         string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
0247         if not sys.platform.startswith('java'):
0248             self.checkequalnofix(False, u'\u1FFc', 'isupper')
0249 
0250     def test_istitle(self):
0251         string_tests.MixinStrUnicodeUserStringTest.test_title(self)
0252         self.checkequalnofix(True, u'\u1FFc', 'istitle')
0253         self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
0254 
0255     def test_isspace(self):
0256         string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
0257         self.checkequalnofix(True, u'\u2000', 'isspace')
0258         self.checkequalnofix(True, u'\u200a', 'isspace')
0259         self.checkequalnofix(False, u'\u2014', 'isspace')
0260 
0261     def test_isalpha(self):
0262         string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
0263         self.checkequalnofix(True, u'\u1FFc', 'isalpha')
0264 
0265     def test_isdecimal(self):
0266         self.checkequalnofix(False, u'', 'isdecimal')
0267         self.checkequalnofix(False, u'a', 'isdecimal')
0268         self.checkequalnofix(True, u'0', 'isdecimal')
0269         self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
0270         self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
0271         self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
0272         self.checkequalnofix(True, u'0123456789', 'isdecimal')
0273         self.checkequalnofix(False, u'0123456789a', 'isdecimal')
0274 
0275         self.checkraises(TypeError, 'abc', 'isdecimal', 42)
0276 
0277     def test_isdigit(self):
0278         string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
0279         self.checkequalnofix(True, u'\u2460', 'isdigit')
0280         self.checkequalnofix(False, u'\xbc', 'isdigit')
0281         self.checkequalnofix(True, u'\u0660', 'isdigit')
0282 
0283     def test_isnumeric(self):
0284         self.checkequalnofix(False, u'', 'isnumeric')
0285         self.checkequalnofix(False, u'a', 'isnumeric')
0286         self.checkequalnofix(True, u'0', 'isnumeric')
0287         self.checkequalnofix(True, u'\u2460', 'isnumeric')
0288         self.checkequalnofix(True, u'\xbc', 'isnumeric')
0289         self.checkequalnofix(True, u'\u0660', 'isnumeric')
0290         self.checkequalnofix(True, u'0123456789', 'isnumeric')
0291         self.checkequalnofix(False, u'0123456789a', 'isnumeric')
0292 
0293         self.assertRaises(TypeError, u"abc".isnumeric, 42)
0294 
0295     def test_contains(self):
0296         # Testing Unicode contains method
0297         self.assert_('a' in u'abdb')
0298         self.assert_('a' in u'bdab')
0299         self.assert_('a' in u'bdaba')
0300         self.assert_('a' in u'bdba')
0301         self.assert_('a' in u'bdba')
0302         self.assert_(u'a' in u'bdba')
0303         self.assert_(u'a' not in u'bdb')
0304         self.assert_(u'a' not in 'bdb')
0305         self.assert_(u'a' in 'bdba')
0306         self.assert_(u'a' in ('a',1,None))
0307         self.assert_(u'a' in (1,None,'a'))
0308         self.assert_(u'a' in (1,None,u'a'))
0309         self.assert_('a' in ('a',1,None))
0310         self.assert_('a' in (1,None,'a'))
0311         self.assert_('a' in (1,None,u'a'))
0312         self.assert_('a' not in ('x',1,u'y'))
0313         self.assert_('a' not in ('x',1,None))
0314         self.assert_(u'abcd' not in u'abcxxxx')
0315         self.assert_(u'ab' in u'abcd')
0316         self.assert_('ab' in u'abc')
0317         self.assert_(u'ab' in 'abc')
0318         self.assert_(u'ab' in (1,None,u'ab'))
0319         self.assert_(u'' in u'abc')
0320         self.assert_('' in u'abc')
0321 
0322         # If the following fails either
0323         # the contains operator does not propagate UnicodeErrors or
0324         # someone has changed the default encoding
0325         self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')
0326 
0327         self.assert_(u'' in '')
0328         self.assert_('' in u'')
0329         self.assert_(u'' in u'')
0330         self.assert_(u'' in 'abc')
0331         self.assert_('' in u'abc')
0332         self.assert_(u'' in u'abc')
0333         self.assert_(u'\0' not in 'abc')
0334         self.assert_('\0' not in u'abc')
0335         self.assert_(u'\0' not in u'abc')
0336         self.assert_(u'\0' in '\0abc')
0337         self.assert_('\0' in u'\0abc')
0338         self.assert_(u'\0' in u'\0abc')
0339         self.assert_(u'\0' in 'abc\0')
0340         self.assert_('\0' in u'abc\0')
0341         self.assert_(u'\0' in u'abc\0')
0342         self.assert_(u'a' in '\0abc')
0343         self.assert_('a' in u'\0abc')
0344         self.assert_(u'a' in u'\0abc')
0345         self.assert_(u'asdf' in 'asdf')
0346         self.assert_('asdf' in u'asdf')
0347         self.assert_(u'asdf' in u'asdf')
0348         self.assert_(u'asdf' not in 'asd')
0349         self.assert_('asdf' not in u'asd')
0350         self.assert_(u'asdf' not in u'asd')
0351         self.assert_(u'asdf' not in '')
0352         self.assert_('asdf' not in u'')
0353         self.assert_(u'asdf' not in u'')
0354 
0355         self.assertRaises(TypeError, u"abc".__contains__)
0356 
0357     def test_formatting(self):
0358         string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
0359         # Testing Unicode formatting strings...
0360         self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
0361         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
0362         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
0363         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
0364         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
0365         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
0366         if not sys.platform.startswith('java'):
0367             self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
0368         self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
0369         self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
0370 
0371         self.assertEqual(u'%c' % 0x1234, u'\u1234')
0372         self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
0373 
0374         # formatting jobs delegated from the string implementation:
0375         self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
0376         self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
0377         self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
0378         self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
0379         self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123},  u'...abc...')
0380         self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
0381         self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
0382         self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
0383         self.assertEqual('...%s...' % u"abc", u'...abc...')
0384         self.assertEqual('%*s' % (5,u'abc',), u'  abc')
0385         self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
0386         self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
0387         self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
0388         self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
0389         self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
0390         self.assertEqual('%c' % u'a', u'a')
0391 
0392 
0393     def test_constructor(self):
0394         # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
0395 
0396         self.assertEqual(
0397             unicode(u'unicode remains unicode'),
0398             u'unicode remains unicode'
0399         )
0400 
0401         class UnicodeSubclass(unicode):
0402             pass
0403 
0404         self.assertEqual(
0405             unicode(UnicodeSubclass('unicode subclass becomes unicode')),
0406             u'unicode subclass becomes unicode'
0407         )
0408 
0409         self.assertEqual(
0410             unicode('strings are converted to unicode'),
0411             u'strings are converted to unicode'
0412         )
0413 
0414         class UnicodeCompat:
0415             def __init__(self, x):
0416                 self.x = x
0417             def __unicode__(self):
0418                 return self.x
0419 
0420         self.assertEqual(
0421             unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
0422             u'__unicode__ compatible objects are recognized')
0423 
0424         class StringCompat:
0425             def __init__(self, x):
0426                 self.x = x
0427             def __str__(self):
0428                 return self.x
0429 
0430         self.assertEqual(
0431             unicode(StringCompat('__str__ compatible objects are recognized')),
0432             u'__str__ compatible objects are recognized'
0433         )
0434 
0435         # unicode(obj) is compatible to str():
0436 
0437         o = StringCompat('unicode(obj) is compatible to str()')
0438         self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
0439         self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
0440 
0441         # %-formatting and .__unicode__()
0442         self.assertEqual(u'%s' %
0443                          UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
0444                          u"u'%s' % obj uses obj.__unicode__()")
0445         self.assertEqual(u'%s' %
0446                          UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
0447                          u"u'%s' % obj falls back to obj.__str__()")
0448 
0449         for obj in (123, 123.45, 123L):
0450             self.assertEqual(unicode(obj), unicode(str(obj)))
0451 
0452         # unicode(obj, encoding, error) tests (this maps to
0453         # PyUnicode_FromEncodedObject() at C level)
0454 
0455         if not sys.platform.startswith('java'):
0456             self.assertRaises(
0457                 TypeError,
0458                 unicode,
0459                 u'decoding unicode is not supported',
0460                 'utf-8',
0461                 'strict'
0462             )
0463 
0464         self.assertEqual(
0465             unicode('strings are decoded to unicode', 'utf-8', 'strict'),
0466             u'strings are decoded to unicode'
0467         )
0468 
0469         if not sys.platform.startswith('java'):
0470             self.assertEqual(
0471                 unicode(
0472                     buffer('character buffers are decoded to unicode'),
0473                     'utf-8',
0474                     'strict'
0475                 ),
0476                 u'character buffers are decoded to unicode'
0477             )
0478 
0479         self.assertRaises(TypeError, unicode, 42, 42, 42)
0480 
0481     def test_codecs_utf7(self):
0482         utfTests = [
0483             (u'A\u2262\u0391.', 'A+ImIDkQ.'),             # RFC2152 example
0484             (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
0485             (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
0486             (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
0487             (u'+', '+-'),
0488             (u'+-', '+--'),
0489             (u'+?', '+-?'),
0490             (u'\?', '+AFw?'),
0491             (u'+?', '+-?'),
0492             (ur'\\?', '+AFwAXA?'),
0493             (ur'\\\?', '+AFwAXABc?'),
0494             (ur'++--', '+-+---')
0495         ]
0496 
0497         for (x, y) in utfTests:
0498             self.assertEqual(x.encode('utf-7'), y)
0499 
0500         # surrogates not supported
0501         self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
0502 
0503         self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
0504 
0505     def test_codecs_utf8(self):
0506         self.assertEqual(u''.encode('utf-8'), '')
0507         self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
0508         self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
0509         self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
0510         self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
0511         self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
0512         self.assertEqual(
0513             (u'\ud800\udc02'*1000).encode('utf-8'),
0514             '\xf0\x90\x80\x82'*1000
0515         )
0516         self.assertEqual(
0517             u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
0518             u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
0519             u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
0520             u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
0521             u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
0522             u' Nunstuck git und'.encode('utf-8'),
0523             '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
0524             '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
0525             '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
0526             '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
0527             '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
0528             '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
0529             '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
0530             '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
0531             '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
0532             '\xe3\x80\x8cWenn ist das Nunstuck git und'
0533         )
0534 
0535         # UTF-8 specific decoding tests
0536         self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
0537         self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
0538         self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )
0539 
0540         # Other possible utf-8 test cases:
0541         # * strict decoding testing for all of the
0542         #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
0543 
0544     def test_codecs_idna(self):
0545         # Test whether trailing dot is preserved
0546         self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
0547 
0548     def test_codecs_errors(self):
0549         # Error handling (encoding)
0550         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
0551         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
0552         self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
0553         self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
0554 
0555         # Error handling (decoding)
0556         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
0557         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
0558         self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
0559         self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
0560 
0561         # Error handling (unknown character names)
0562         self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
0563 
0564         # Error handling (truncated escape sequence)
0565         self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
0566 
0567         # Error handling (bad decoder return)
0568         def search_function(encoding):
0569             def decode1(input, errors="strict"):
0570                 return 42 # not a tuple
0571             def encode1(input, errors="strict"):
0572                 return 42 # not a tuple
0573             def encode2(input, errors="strict"):
0574                 return (42, 42) # no unicode
0575             def decode2(input, errors="strict"):
0576                 return (42, 42) # no unicode
0577             if encoding=="test.unicode1":
0578                 return (encode1, decode1, None, None)
0579             elif encoding=="test.unicode2":
0580                 return (encode2, decode2, None, None)
0581             else:
0582                 return None
0583         codecs.register(search_function)
0584         self.assertRaises(TypeError, "hello".decode, "test.unicode1")
0585         self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
0586         self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
0587         self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
0588         # executes PyUnicode_Encode()
0589         import imp
0590         self.assertRaises(
0591             ImportError,
0592             imp.find_module,
0593             "non-existing module",
0594             [u"non-existing dir"]
0595         )
0596 
0597         # Error handling (wrong arguments)
0598         self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
0599 
0600         # Error handling (PyUnicode_EncodeDecimal())
0601         self.assertRaises(UnicodeError, int, u"\u0200")
0602 
0603     def test_codecs(self):
0604         # Encoding
0605         self.assertEqual(u'hello'.encode('ascii'), 'hello')
0606         self.assertEqual(u'hello'.encode('utf-7'), 'hello')
0607         self.assertEqual(u'hello'.encode('utf-8'), 'hello')
0608         self.assertEqual(u'hello'.encode('utf8'), 'hello')
0609         self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
0610         self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
0611         self.assertEqual(u'hello'.encode('latin-1'), 'hello')
0612 
0613         # Roundtrip safety for BMP (just the first 1024 chars)
0614         u = u''.join(map(unichr, xrange(1024)))
0615         for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
0616                          'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
0617             self.assertEqual(unicode(u.encode(encoding),encoding), u)
0618 
0619         # Roundtrip safety for BMP (just the first 256 chars)
0620         u = u''.join(map(unichr, xrange(256)))
0621         for encoding in ('latin-1',):
0622             self.assertEqual(unicode(u.encode(encoding),encoding), u)
0623 
0624         # Roundtrip safety for BMP (just the first 128 chars)
0625         u = u''.join(map(unichr, xrange(128)))
0626         for encoding in ('ascii',):
0627             self.assertEqual(unicode(u.encode(encoding),encoding), u)
0628 
0629         # Roundtrip safety for non-BMP (just a few chars)
0630         u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
0631         for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
0632                          #'raw_unicode_escape',
0633                          'unicode_escape', 'unicode_internal'):
0634             self.assertEqual(unicode(u.encode(encoding),encoding), u)
0635 
0636         # UTF-8 must be roundtrip safe for all UCS-2 code points
0637         # This excludes surrogates: in the full range, there would be
0638         # a surrogate pair (\udbff\udc00), which gets converted back
0639         # to a non-BMP character (\U0010fc00)
0640         u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
0641         for encoding in ('utf-8',):
0642             self.assertEqual(unicode(u.encode(encoding),encoding), u)
0643 
0644     def test_codecs_charmap(self):
0645         # 0-127
0646         s = ''.join(map(chr, xrange(128)))
0647         for encoding in (
0648             'cp037', 'cp1026',
0649             'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
0650             'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
0651             'cp863', 'cp865', 'cp866',
0652             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
0653             'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
0654             'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
0655             'mac_cyrillic', 'mac_latin2',
0656 
0657             'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
0658             'cp1256', 'cp1257', 'cp1258',
0659             'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
0660 
0661             'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
0662             'cp1006', 'iso8859_8',
0663 
0664             ### These have undefined mappings:
0665             #'cp424',
0666 
0667             ### These fail the round-trip:
0668             #'cp875'
0669 
0670             ):
0671             self.assertEqual(unicode(s, encoding).encode(encoding), s)
0672 
0673         # 128-255
0674         s = ''.join(map(chr, xrange(128, 256)))
0675         for encoding in (
0676             'cp037', 'cp1026',
0677             'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
0678             'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
0679             'cp863', 'cp865', 'cp866',
0680             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
0681             'iso8859_2', 'iso8859_4', 'iso8859_5',
0682             'iso8859_9', 'koi8_r', 'latin_1',
0683             'mac_cyrillic', 'mac_latin2',
0684 
0685             ### These have undefined mappings:
0686             #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
0687             #'cp1256', 'cp1257', 'cp1258',
0688             #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
0689             #'iso8859_3', 'iso8859_6', 'iso8859_7',
0690             #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
0691 
0692             ### These fail the round-trip:
0693             #'cp1006', 'cp875', 'iso8859_8',
0694 
0695             ):
0696             self.assertEqual(unicode(s, encoding).encode(encoding), s)
0697 
0698     def test_concatenation(self):
0699         self.assertEqual((u"abc" u"def"), u"abcdef")
0700         self.assertEqual(("abc" u"def"), u"abcdef")
0701         self.assertEqual((u"abc" "def"), u"abcdef")
0702         self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
0703         self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
0704 
0705     def test_printing(self):
0706         class BitBucket:
0707             def write(self, text):
0708                 pass
0709 
0710         out = BitBucket()
0711         print >>out, u'abc'
0712         print >>out, u'abc', u'def'
0713         print >>out, u'abc', 'def'
0714         print >>out, 'abc', u'def'
0715         print >>out, u'abc\n'
0716         print >>out, u'abc\n',
0717         print >>out, u'abc\n',
0718         print >>out, u'def\n'
0719         print >>out, u'def\n'
0720 
0721     def test_ucs4(self):
0722         if sys.maxunicode == 0xFFFF:
0723             return
0724         x = u'\U00100000'
0725         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
0726         self.assertEqual(x, y)
0727 
def test_main():
    """Hand UnicodeTest to test_support.run_unittest()."""
    case_classes = (UnicodeTest,)
    test_support.run_unittest(*case_classes)

# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()
0733 

Generated by PyXR 0.9.4
SourceForge.net Logo