PyXR

c:\python24\lib \ gettext.py


0001 """Internationalization and localization support.
0002 
0003 This module provides internationalization (I18N) and localization (L10N)
0004 support for your Python programs by providing an interface to the GNU gettext
0005 message catalog library.
0006 
0007 I18N refers to the operation by which a program is made aware of multiple
0008 languages.  L10N refers to the adaptation of your program, once
0009 internationalized, to the local language and cultural habits.
0010 
0011 """
0012 
0013 # This module represents the integration of work, contributions, feedback, and
0014 # suggestions from the following people:
0015 #
0016 # Martin von Loewis, who wrote the initial implementation of the underlying
0017 # C-based libintlmodule (later renamed _gettext), along with a skeletal
0018 # gettext.py implementation.
0019 #
0020 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
0021 # which also included a pure-Python implementation to read .mo files if
0022 # intlmodule wasn't available.
0023 #
0024 # James Henstridge, who also wrote a gettext.py module, which has some
0025 # interesting, but currently unsupported experimental features: the notion of
0026 # a Catalog class and instances, and the ability to add to a catalog file via
0027 # a Python API.
0028 #
0029 # Barry Warsaw integrated these modules, wrote the .install() API and code,
0030 # and conformed all C and Python code to Python's coding standards.
0031 #
0032 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
0033 # module.
0034 #
0035 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
0036 #
0037 # TODO:
0038 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
0039 #   memory, but that's probably bad for large translated programs.  Instead,
0040 #   the lexical sort of original strings in GNU .mo files should be exploited
0041 #   to do binary searches and lazy initializations.  Or you might want to use
0042 #   the undocumented double-hash algorithm for .mo files with hash tables, but
0043 #   you'll need to study the GNU gettext code to do this.
0044 #
0045 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
0046 #   find this format documented anywhere.
0047 
0048 
0049 import locale, copy, os, re, struct, sys
0050 from errno import ENOENT
0051 
0052 
# Names exported by ``from gettext import *``.  The locale-encoding variants
# (lgettext, ldgettext, lngettext, ldngettext) and bind_textdomain_codeset are
# public functions of this module, so they are listed next to their
# counterparts.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'ngettext',
           'ldgettext', 'ldngettext', 'lgettext', 'lngettext',
           ]

# Locale directory used when the caller does not supply one.
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
0059 
0060 
0061 def test(condition, true, false):
0062     """
0063     Implements the C expression:
0064 
0065       condition ? true : false
0066 
0067     Required to correctly interpret plural forms.
0068     """
0069     if condition:
0070         return true
0071     else:
0072         return false
0073 
0074 
def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python lambda function that implements an equivalent expression.

    The returned function takes the number n and returns the expression's
    value converted with int().

    Raises ValueError when the expression cannot be tokenized (for example
    because of unbalanced parentheses) or when it contains any identifier
    other than "n".
    """
    # Security check, allow only the "n" identifier
    try:
        from StringIO import StringIO
    except ImportError:
        # Interpreters that lack the StringIO module provide the class in io.
        from io import StringIO
    import token, tokenize
    tokens = tokenize.generate_tokens(StringIO(plural).readline)
    try:
        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
    except tokenize.TokenError:
        raise ValueError(
            'plural forms expression error, maybe unbalanced parenthesis')
    if danger:
        raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')

    # "!x" becomes " not x"; "!=" is left alone because of the [^=] guard.
    expr = re.compile(r'\!([^=])')
    plural = expr.sub(' not \\1', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "test(a,b,c)".
    expr = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(x):
        # The "else" part may itself contain a ternary, so rewrite it
        # recursively.
        return "test(%s, %s, %s)" % (x.group(1), x.group(2),
                                     expr.sub(repl, x.group(3)))

    # Code to transform the plural expression, taking care of parentheses.
    # Each open parenthesis starts a new stack entry; when it closes, the
    # entry is rewritten on its own and appended to the enclosing entry.
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                # Actually, we never reach this code, because unbalanced
                # parentheses get caught in the security check at the
                # beginning.
                raise ValueError('unbalanced parenthesis in plural form')
            s = expr.sub(repl, stack.pop())
            stack[-1] += '(%s)' % s
        else:
            stack[-1] += c
    plural = expr.sub(repl, stack.pop())

    # NOTE(security): the expression text comes from the catalog file and is
    # handed to eval().  The identifier check above is the only safeguard; it
    # does not restrict numeric or string literals, so catalogs must come
    # from a trusted source.
    return eval('lambda n: int(%s)' % plural)
0124 
0125 
0126 
def _expand_lang(locale):
    """Return the list of names to try for a locale, fullest name first.

    The name is normalized with locale.normalize() and split into language,
    territory ("_..."), codeset ("....") and modifier ("@...").  One name is
    produced for every combination of the components that are present,
    starting with the name that carries all of them and ending with the bare
    language.
    """
    from locale import normalize
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2
    remainder = normalize(locale)
    # Peel the optional components off the right-hand end: modifier first,
    # then codeset, then territory.  Each piece keeps its separator.
    present = 0
    pieces = {}
    for flag, separator in ((COMPONENT_MODIFIER, '@'),
                            (COMPONENT_CODESET, '.'),
                            (COMPONENT_TERRITORY, '_')):
        cut = remainder.find(separator)
        if cut < 0:
            pieces[flag] = ''
            continue
        pieces[flag] = remainder[cut:]
        remainder = remainder[:cut]
        present |= flag
    language = remainder
    # Walk the bit combinations from highest to lowest so the result is
    # already in its final order.
    expanded = []
    for combo in range(present, -1, -1):
        if combo & ~present:
            # combo asks for a component this locale does not have
            continue
        name = language
        for flag in (COMPONENT_TERRITORY, COMPONENT_CODESET,
                     COMPONENT_MODIFIER):
            if combo & flag:
                name += pieces[flag]
        expanded.append(name)
    return expanded
0167 
0168 
0169 
class NullTranslations:
    """Translation object that hands messages back untranslated.

    Every lookup is delegated to the fallback object when one has been
    attached with add_fallback(); otherwise the original message (or the
    singular/plural msgid chosen by n == 1) is returned.
    """

    def __init__(self, fp=None):
        """Set up empty state and, when fp is given, pass it to _parse()."""
        self._fallback = None
        self._output_charset = None
        self._charset = None
        self._info = {}
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses that read a catalog; does nothing here."""
        pass

    def add_fallback(self, fallback):
        """Attach fallback at the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
        else:
            self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's translation, or message unchanged."""
        if not self._fallback:
            return message
        return self._fallback.gettext(message)

    def lgettext(self, message):
        """Like gettext(), but delegates to the fallback's lgettext()."""
        if not self._fallback:
            return message
        return self._fallback.lgettext(message)

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural translation, else msgid1 when
        n == 1 and msgid2 otherwise."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but delegates to the fallback's lngettext()."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        return msgid2

    def ugettext(self, message):
        """Return the fallback's ugettext() result, or unicode(message)."""
        if not self._fallback:
            return unicode(message)
        return self._fallback.ugettext(message)

    def ungettext(self, msgid1, msgid2, n):
        """Return the fallback's ungettext() result, else the msgid chosen
        by n == 1 converted with unicode()."""
        if self._fallback:
            return self._fallback.ungettext(msgid1, msgid2, n)
        if n == 1:
            return unicode(msgid1)
        return unicode(msgid2)

    def info(self):
        """Return the metadata dictionary."""
        return self._info

    def charset(self):
        """Return the catalog charset (None unless a parser set it)."""
        return self._charset

    def output_charset(self):
        """Return the charset set with set_output_charset(), if any."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Remember charset as the output charset."""
        self._output_charset = charset

    def install(self, unicode=False):
        """Bind _ in the builtin namespace to ugettext (when unicode is
        true) or to gettext."""
        import __builtin__
        if unicode:
            translate = self.ugettext
        else:
            translate = self.gettext
        __builtin__.__dict__['_'] = translate
0242 
0243 
class GNUTranslations(NullTranslations):
    """Translations read from a GNU gettext .mo file.

    _parse() loads the whole file into self._catalog.  Singular entries are
    keyed by msgid; plural entries are keyed by (msgid1, index), where index
    is the value that self.plural(n) selects at lookup time.
    """

    # Magic number of .mo files, as seen when the first word is read little
    # endian from a little endian (LE) or big endian (BE) file.
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of fp, fills self._catalog and self._info, and sets
        self._charset and self.plural from the catalog's metadata entry
        (the entry whose msgid is empty).

        Raises IOError for an unknown magic number, or when a message or
        translation does not end before the end of the file.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 32 bit words: the
        # magic number followed by version, msgcount, masteridx, transidx.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for i in xrange(0, msgcount):
            # Each table entry is a (length, offset) pair.
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for item in tmsg.splitlines():
                    item = item.strip()
                    if not item:
                        continue
                    if ':' not in item:
                        # Continuation of the previous header.  It must not
                        # re-run the charset/plural handling below: that
                        # would act on the previous line's key and value.
                        if lastk:
                            self._info[lastk] += '\n' + item
                        continue
                    k, v = item.split(':', 1)
                    k = k.strip().lower()
                    v = v.strip()
                    self._info[k] = v
                    lastk = k
                    if k == 'content-type':
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        # The expression is taken from the second
                        # ';'-separated field, after "plural=".
                        plural = v.split(';')[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            if '\x00' in msg:
                # Plural forms: the msgid holds singular and plural separated
                # by NUL; only the singular is used as the key.
                msgid1, msgid2 = msg.split('\x00')
                tmsg = tmsg.split('\x00')
                if self._charset:
                    msgid1 = unicode(msgid1, self._charset)
                    tmsg = [unicode(x, self._charset) for x in tmsg]
                for index, text in enumerate(tmsg):
                    catalog[(msgid1, index)] = text
            else:
                if self._charset:
                    msg = unicode(msg, self._charset)
                    tmsg = unicode(tmsg, self._charset)
                catalog[msg] = tmsg
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of message as an encoded string.

        The output charset is used when set, otherwise the catalog charset;
        with neither, the stored value is returned as is.  A message missing
        from the catalog goes to the fallback, or is returned unchanged.
        """
        # A private sentinel, so that any stored value counts as a hit.
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        # Encode the Unicode tmsg back to an 8-bit string, if possible
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        elif self._charset:
            return tmsg.encode(self._charset)
        return tmsg

    def lgettext(self, message):
        """Like gettext(), but without an output charset the translation is
        encoded with locale.getpreferredencoding()."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural form selected by self.plural(n) for msgid1,
        encoded as in gettext().

        When the catalog has no such entry the fallback is asked; without a
        fallback msgid1 is returned for n == 1 and msgid2 otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            elif self._charset:
                return tmsg.encode(self._charset)
            return tmsg
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but without an output charset the translation is
        encoded with locale.getpreferredencoding()."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            return tmsg.encode(locale.getpreferredencoding())
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def ugettext(self, message):
        """Return the stored translation of message without encoding it.

        A message missing from the catalog goes to the fallback, or is
        returned as unicode(message).
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.ugettext(message)
            return unicode(message)
        return tmsg

    def ungettext(self, msgid1, msgid2, n):
        """Return the stored plural form selected by self.plural(n) without
        encoding it.

        When the catalog has no such entry the fallback is asked; without a
        fallback the msgid chosen by n == 1 is returned through unicode().
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ungettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = unicode(msgid1)
            else:
                tmsg = unicode(msgid2)
        return tmsg
0405 
0406 
# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=0):
    """Search localedir for the .mo file(s) of domain.

    languages defaults to the ':'-separated value of the first non-empty
    variable among LANGUAGE, LC_ALL, LC_MESSAGES and LANG, with 'C' appended
    when absent.  Each language is expanded with _expand_lang() and the
    candidates are tried in order, stopping at 'C'.

    With all false, returns the first existing path, or None.  With all
    true, returns the list of every existing path (possibly empty).
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            val = os.environ.get(envar)
            if val:
                languages = val.split(':')
                break
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages, dropping duplicates but
    # keeping the first-seen order
    candidates = []
    for lang in languages:
        for variant in _expand_lang(lang):
            if variant not in candidates:
                candidates.append(variant)
    # select a language
    matches = []
    for lang in candidates:
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if not os.path.exists(mofile):
            continue
        if not all:
            return mofile
        matches.append(mofile)
    if all:
        return matches
    return None
0442 
0443 
0444 
# a mapping between (class, absolute .mo file path) and Translation object
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translation object for domain.

    Every .mo file reported by find() is loaded with class_ (GNUTranslations
    by default); the first one becomes the result and the others are chained
    to it with add_fallback().  When codeset is given it is set as the output
    charset on each object.

    class_ is called with the open file and must read what it needs during
    construction, because the file is closed as soon as the call returns.

    Raises IOError(ENOENT) when no file is found, unless fallback is true,
    in which case a NullTranslations instance is returned.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=1)
    if not mofiles:
        if fallback:
            return NullTranslations()
        raise IOError(ENOENT, 'No translation file found for domain', domain)
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    result = None
    for mofile in mofiles:
        # The class is part of the key so that a request with a different
        # class_ is not answered with an instance of another class.
        key = (class_, os.path.abspath(mofile))
        t = _translations.get(key)
        if t is None:
            fp = open(mofile, 'rb')
            try:
                t = _translations.setdefault(key, class_(fp))
            finally:
                # Close the file explicitly rather than leaving it to
                # garbage collection.
                fp.close()
        # Copy the translation object to allow setting fallbacks and
        # output charset. All other instance data is shared with the
        # cached object.
        t = copy.copy(t)
        if codeset:
            t.set_output_charset(codeset)
        if result is None:
            result = t
        else:
            result.add_fallback(t)
    return result
0477 
0478 
def install(domain, localedir=None, unicode=False, codeset=None):
    """Look up the translations for domain and call their install() method.

    translation() is called with fallback=True, so a missing catalog does
    not raise.  The unicode flag is passed on to install().
    """
    translation(domain, localedir, fallback=True,
                codeset=codeset).install(unicode)
0482 
0483 
0484 
# domain -> locale directory
_localedirs = {}
# domain -> codeset
_localecodesets = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'


def textdomain(domain=None):
    """Return the current global domain, first changing it to domain when
    domain is not None."""
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return _current_domain
0498 
0499 
def bindtextdomain(domain, localedir=None):
    """Return the locale directory bound to domain, first binding it to
    localedir when localedir is not None.

    A domain without a binding yields the default locale directory.
    """
    if localedir is not None:
        _localedirs[domain] = localedir
    try:
        return _localedirs[domain]
    except KeyError:
        return _default_localedir
0505 
0506 
def bind_textdomain_codeset(domain, codeset=None):
    """Return the codeset bound to domain, first binding it to codeset when
    codeset is not None.

    A domain without a binding yields None.
    """
    if codeset is not None:
        _localecodesets[domain] = codeset
    try:
        return _localecodesets[domain]
    except KeyError:
        return None
0512 
0513 
def dgettext(domain, message):
    """Translate message with the catalog of domain via gettext().

    Uses the locale directory and codeset bound to the domain.  Returns
    message unchanged when translation() raises IOError.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        return message
    return catalog.gettext(message)
0521 
def ldgettext(domain, message):
    """Translate message with the catalog of domain via lgettext().

    Uses the locale directory and codeset bound to the domain.  Returns
    message unchanged when translation() raises IOError.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        return message
    return catalog.lgettext(message)
0529 
def dngettext(domain, msgid1, msgid2, n):
    """Translate a plural message with the catalog of domain via ngettext().

    Uses the locale directory and codeset bound to the domain.  When
    translation() raises IOError, returns msgid1 for n == 1 and msgid2
    otherwise.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        if n == 1:
            return msgid1
        return msgid2
    return catalog.ngettext(msgid1, msgid2, n)
0540 
def ldngettext(domain, msgid1, msgid2, n):
    """Translate a plural message with the catalog of domain via lngettext().

    Uses the locale directory and codeset bound to the domain.  When
    translation() raises IOError, returns msgid1 for n == 1 and msgid2
    otherwise.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        if n == 1:
            return msgid1
        return msgid2
    return catalog.lngettext(msgid1, msgid2, n)
0551 
def gettext(message):
    """Translate message in the current global domain (see dgettext)."""
    domain = _current_domain
    return dgettext(domain, message)
0554 
def lgettext(message):
    """Translate message in the current global domain (see ldgettext)."""
    domain = _current_domain
    return ldgettext(domain, message)
0557 
def ngettext(msgid1, msgid2, n):
    """Translate a plural message in the current global domain (see
    dngettext)."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
0560 
def lngettext(msgid1, msgid2, n):
    """Translate a plural message in the current global domain (see
    ldngettext)."""
    domain = _current_domain
    return ldngettext(domain, msgid1, msgid2, n)
0563 
0564 # dcgettext() has been deemed unnecessary and is not implemented.
0565 
# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object does not currently support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Catalog is simply another name for translation().
Catalog = translation
0579
Generated by PyXR 0.9.4