gettext.py
changeset 0 b97547f5f1fa
child 1138 22f634977c95
equal deleted inserted replaced
-1:000000000000 0:b97547f5f1fa
       
     1 """Internationalization and localization support.
       
     2 
       
     3 This module provides internationalization (I18N) and localization (L10N)
       
     4 support for your Python programs by providing an interface to the GNU gettext
       
     5 message catalog library.
       
     6 
       
     7 I18N refers to the operation by which a program is made aware of multiple
       
     8 languages.  L10N refers to the adaptation of your program, once
       
     9 internationalized, to the local language and cultural habits.
       
    10 
       
    11 """
       
    12 
       
    13 # This module represents the integration of work, contributions, feedback, and
       
    14 # suggestions from the following people:
       
    15 #
       
    16 # Martin von Loewis, who wrote the initial implementation of the underlying
       
    17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
       
    18 # gettext.py implementation.
       
    19 #
       
    20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
       
    21 # which also included a pure-Python implementation to read .mo files if
       
    22 # intlmodule wasn't available.
       
    23 #
       
    24 # James Henstridge, who also wrote a gettext.py module, which has some
       
    25 # interesting, but currently unsupported experimental features: the notion of
       
    26 # a Catalog class and instances, and the ability to add to a catalog file via
       
    27 # a Python API.
       
    28 #
       
    29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
       
    30 # and conformed all C and Python code to Python's coding standards.
       
    31 #
       
    32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
       
    33 # module.
       
    34 #
       
    35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
       
    36 #
       
    37 # TODO:
       
    38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
       
    39 #   memory, but that's probably bad for large translated programs.  Instead,
       
    40 #   the lexical sort of original strings in GNU .mo files should be exploited
       
    41 #   to do binary searches and lazy initializations.  Or you might want to use
       
    42 #   the undocumented double-hash algorithm for .mo files with hash tables, but
       
    43 #   you'll need to study the GNU gettext code to do this.
       
    44 #
       
    45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
       
    46 #   find this format documented anywhere.
       
    47 
       
    48 
       
    49 import copy, os, re, struct, sys
       
    50 from errno import ENOENT
       
    51 
       
    52 
       
# Public names exported by ``from gettext import *``.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'dgettext', 'dngettext', 'gettext', 'ngettext',
           ]

# Catalog root used when no locale directory is supplied:
# <sys.prefix>/share/locale
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
       
    59 
       
    60 
       
    61 def test(condition, true, false):
       
    62     """
       
    63     Implements the C expression:
       
    64 
       
    65       condition ? true : false
       
    66 
       
    67     Required to correctly interpret plural forms.
       
    68     """
       
    69     if condition:
       
    70         return true
       
    71     else:
       
    72         return false
       
    73 
       
    74 
       
    75 def c2py(plural):
       
    76     """Gets a C expression as used in PO files for plural forms and returns a
       
    77     Python lambda function that implements an equivalent expression.
       
    78     """
       
    79     # Security check, allow only the "n" identifier
       
    80     from StringIO import StringIO
       
    81     import token, tokenize
       
    82     tokens = tokenize.generate_tokens(StringIO(plural).readline)
       
    83     try:
       
    84         danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
       
    85     except tokenize.TokenError:
       
    86         raise ValueError, \
       
    87               'plural forms expression error, maybe unbalanced parenthesis'
       
    88     else:
       
    89         if danger:
       
    90             raise ValueError, 'plural forms expression could be dangerous'
       
    91 
       
    92     # Replace some C operators by their Python equivalents
       
    93     plural = plural.replace('&&', ' and ')
       
    94     plural = plural.replace('||', ' or ')
       
    95 
       
    96     expr = re.compile(r'\!([^=])')
       
    97     plural = expr.sub(' not \\1', plural)
       
    98 
       
    99     # Regular expression and replacement function used to transform
       
   100     # "a?b:c" to "test(a,b,c)".
       
   101     expr = re.compile(r'(.*?)\?(.*?):(.*)')
       
   102     def repl(x):
       
   103         return "test(%s, %s, %s)" % (x.group(1), x.group(2),
       
   104                                      expr.sub(repl, x.group(3)))
       
   105 
       
   106     # Code to transform the plural expression, taking care of parentheses
       
   107     stack = ['']
       
   108     for c in plural:
       
   109         if c == '(':
       
   110             stack.append('')
       
   111         elif c == ')':
       
   112             if len(stack) == 1:
       
   113                 # Actually, we never reach this code, because unbalanced
       
   114                 # parentheses get caught in the security check at the
       
   115                 # beginning.
       
   116                 raise ValueError, 'unbalanced parenthesis in plural form'
       
   117             s = expr.sub(repl, stack.pop())
       
   118             stack[-1] += '(%s)' % s
       
   119         else:
       
   120             stack[-1] += c
       
   121     plural = expr.sub(repl, stack.pop())
       
   122 
       
   123     return eval('lambda n: int(%s)' % plural)
       
   124 
       
   125 
       
   126 
       
   127 def _expand_lang(locale):
       
   128     from locale import normalize
       
   129     locale = normalize(locale)
       
   130     COMPONENT_CODESET   = 1 << 0
       
   131     COMPONENT_TERRITORY = 1 << 1
       
   132     COMPONENT_MODIFIER  = 1 << 2
       
   133     # split up the locale into its base components
       
   134     mask = 0
       
   135     pos = locale.find('@')
       
   136     if pos >= 0:
       
   137         modifier = locale[pos:]
       
   138         locale = locale[:pos]
       
   139         mask |= COMPONENT_MODIFIER
       
   140     else:
       
   141         modifier = ''
       
   142     pos = locale.find('.')
       
   143     if pos >= 0:
       
   144         codeset = locale[pos:]
       
   145         locale = locale[:pos]
       
   146         mask |= COMPONENT_CODESET
       
   147     else:
       
   148         codeset = ''
       
   149     pos = locale.find('_')
       
   150     if pos >= 0:
       
   151         territory = locale[pos:]
       
   152         locale = locale[:pos]
       
   153         mask |= COMPONENT_TERRITORY
       
   154     else:
       
   155         territory = ''
       
   156     language = locale
       
   157     ret = []
       
   158     for i in range(mask+1):
       
   159         if not (i & ~mask):  # if all components for this combo exist ...
       
   160             val = language
       
   161             if i & COMPONENT_TERRITORY: val += territory
       
   162             if i & COMPONENT_CODESET:   val += codeset
       
   163             if i & COMPONENT_MODIFIER:  val += modifier
       
   164             ret.append(val)
       
   165     ret.reverse()
       
   166     return ret
       
   167 
       
   168 
       
   169 
       
   170 class NullTranslations:
       
   171     def __init__(self, fp=None):
       
   172         self._info = {}
       
   173         self._charset = None
       
   174         self._fallback = None
       
   175         if fp is not None:
       
   176             self._parse(fp)
       
   177 
       
   178     def _parse(self, fp):
       
   179         pass
       
   180 
       
   181     def add_fallback(self, fallback):
       
   182         if self._fallback:
       
   183             self._fallback.add_fallback(fallback)
       
   184         else:
       
   185             self._fallback = fallback
       
   186 
       
   187     def gettext(self, message):
       
   188         if self._fallback:
       
   189             return self._fallback.gettext(message)
       
   190         return message
       
   191 
       
   192     def ngettext(self, msgid1, msgid2, n):
       
   193         if self._fallback:
       
   194             return self._fallback.ngettext(msgid1, msgid2, n)
       
   195         if n == 1:
       
   196             return msgid1
       
   197         else:
       
   198             return msgid2
       
   199 
       
   200     def ugettext(self, message):
       
   201         if self._fallback:
       
   202             return self._fallback.ugettext(message)
       
   203         return unicode(message)
       
   204 
       
   205     def ungettext(self, msgid1, msgid2, n):
       
   206         if self._fallback:
       
   207             return self._fallback.ungettext(msgid1, msgid2, n)
       
   208         if n == 1:
       
   209             return unicode(msgid1)
       
   210         else:
       
   211             return unicode(msgid2)
       
   212 
       
   213     def info(self):
       
   214         return self._info
       
   215 
       
   216     def charset(self):
       
   217         return self._charset
       
   218 
       
   219     def install(self, unicode=False):
       
   220         import __builtin__
       
   221         __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
       
   222 
       
   223 
       
   224 class GNUTranslations(NullTranslations):
       
   225     # Magic number of .mo files
       
   226     LE_MAGIC = 0x950412deL
       
   227     BE_MAGIC = 0xde120495L
       
   228 
       
   229     def _parse(self, fp):
       
   230         """Override this method to support alternative .mo formats."""
       
   231         unpack = struct.unpack
       
   232         filename = getattr(fp, 'name', '')
       
   233         # Parse the .mo file header, which consists of 5 little endian 32
       
   234         # bit words.
       
   235         self._catalog = catalog = {}
       
   236         self.plural = lambda n: int(n != 1) # germanic plural by default
       
   237         buf = fp.read()
       
   238         buflen = len(buf)
       
   239         # Are we big endian or little endian?
       
   240         magic = unpack('<I', buf[:4])[0]
       
   241         if magic == self.LE_MAGIC:
       
   242             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
       
   243             ii = '<II'
       
   244         elif magic == self.BE_MAGIC:
       
   245             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
       
   246             ii = '>II'
       
   247         else:
       
   248             raise IOError(0, 'Bad magic number', filename)
       
   249         # Now put all messages from the .mo file buffer into the catalog
       
   250         # dictionary.
       
   251         for i in xrange(0, msgcount):
       
   252             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
       
   253             mend = moff + mlen
       
   254             tlen, toff = unpack(ii, buf[transidx:transidx+8])
       
   255             tend = toff + tlen
       
   256             if mend < buflen and tend < buflen:
       
   257                 msg = buf[moff:mend]
       
   258                 tmsg = buf[toff:tend]
       
   259             else:
       
   260                 raise IOError(0, 'File is corrupt', filename)
       
   261             # See if we're looking at GNU .mo conventions for metadata
       
   262             if mlen == 0:
       
   263                 # Catalog description
       
   264                 # don't handle multi-lines fields here, and skip
       
   265                 # lines which don't look like a header description
       
   266                 # (e.g. "header: value")
       
   267                 lastk = k = None
       
   268                 for item in tmsg.splitlines():
       
   269                     item = item.strip()
       
   270                     if not item or not ':' in item:
       
   271                         continue
       
   272                     k, v = item.split(':', 1)
       
   273                     k = k.strip().lower()
       
   274                     v = v.strip()
       
   275                     self._info[k] = v
       
   276                     if k == 'content-type':
       
   277                         self._charset = v.split('charset=')[1]
       
   278                     elif k == 'plural-forms':
       
   279                         v = v.split(';')
       
   280                         plural = v[1].split('plural=')[1]
       
   281                         self.plural = c2py(plural)
       
   282             # Note: we unconditionally convert both msgids and msgstrs to
       
   283             # Unicode using the character encoding specified in the charset
       
   284             # parameter of the Content-Type header.  The gettext documentation
       
   285             # strongly encourages msgids to be us-ascii, but some appliations
       
   286             # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
       
   287             # traditional gettext applications, the msgid conversion will
       
   288             # cause no problems since us-ascii should always be a subset of
       
   289             # the charset encoding.  We may want to fall back to 8-bit msgids
       
   290             # if the Unicode conversion fails.
       
   291             if msg.find('\x00') >= 0:
       
   292                 # Plural forms
       
   293                 msgid1, msgid2 = msg.split('\x00')
       
   294                 tmsg = tmsg.split('\x00')
       
   295                 if self._charset:
       
   296                     msgid1 = unicode(msgid1, self._charset)
       
   297                     tmsg = [unicode(x, self._charset) for x in tmsg]
       
   298                 for i in range(len(tmsg)):
       
   299                     catalog[(msgid1, i)] = tmsg[i]
       
   300             else:
       
   301                 if self._charset:
       
   302                     msg = unicode(msg, self._charset)
       
   303                     tmsg = unicode(tmsg, self._charset)
       
   304                 catalog[msg] = tmsg
       
   305             # advance to next entry in the seek tables
       
   306             masteridx += 8
       
   307             transidx += 8
       
   308 
       
   309     def gettext(self, message):
       
   310         missing = object()
       
   311         tmsg = self._catalog.get(message, missing)
       
   312         if tmsg is missing:
       
   313             if self._fallback:
       
   314                 return self._fallback.gettext(message)
       
   315             return message
       
   316         # Encode the Unicode tmsg back to an 8-bit string, if possible
       
   317         if self._charset:
       
   318             return tmsg.encode(self._charset)
       
   319         return tmsg
       
   320 
       
   321     def ngettext(self, msgid1, msgid2, n):
       
   322         try:
       
   323             tmsg = self._catalog[(msgid1, self.plural(n))]
       
   324             if self._charset:
       
   325                 return tmsg.encode(self._charset)
       
   326             return tmsg
       
   327         except KeyError:
       
   328             if self._fallback:
       
   329                 return self._fallback.ngettext(msgid1, msgid2, n)
       
   330             if n == 1:
       
   331                 return msgid1
       
   332             else:
       
   333                 return msgid2
       
   334 
       
   335     def ugettext(self, message):
       
   336         missing = object()
       
   337         tmsg = self._catalog.get(message, missing)
       
   338         if tmsg is missing:
       
   339             if self._fallback:
       
   340                 return self._fallback.ugettext(message)
       
   341             return unicode(message)
       
   342         return tmsg
       
   343 
       
   344     def ungettext(self, msgid1, msgid2, n):
       
   345         try:
       
   346             tmsg = self._catalog[(msgid1, self.plural(n))]
       
   347         except KeyError:
       
   348             if self._fallback:
       
   349                 return self._fallback.ungettext(msgid1, msgid2, n)
       
   350             if n == 1:
       
   351                 tmsg = unicode(msgid1)
       
   352             else:
       
   353                 tmsg = unicode(msgid2)
       
   354         return tmsg
       
   355 
       
   356 
       
   357 # Locate a .mo file using the gettext strategy
       
   358 def find(domain, localedir=None, languages=None, all=0):
       
   359     # Get some reasonable defaults for arguments that were not supplied
       
   360     if localedir is None:
       
   361         localedir = _default_localedir
       
   362     if languages is None:
       
   363         languages = []
       
   364         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
       
   365             val = os.environ.get(envar)
       
   366             if val:
       
   367                 languages = val.split(':')
       
   368                 break
       
   369         if 'C' not in languages:
       
   370             languages.append('C')
       
   371     # now normalize and expand the languages
       
   372     nelangs = []
       
   373     for lang in languages:
       
   374         for nelang in _expand_lang(lang):
       
   375             if nelang not in nelangs:
       
   376                 nelangs.append(nelang)
       
   377     # select a language
       
   378     if all:
       
   379         result = []
       
   380     else:
       
   381         result = None
       
   382     for lang in nelangs:
       
   383         if lang == 'C':
       
   384             break
       
   385         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
       
   386         if os.path.exists(mofile):
       
   387             if all:
       
   388                 result.append(mofile)
       
   389             else:
       
   390                 return mofile
       
   391     return result
       
   392 
       
   393 
       
   394 
       
   395 # a mapping between absolute .mo file path and Translation object
       
   396 _translations = {}
       
   397 
       
   398 def translation(domain, localedir=None, languages=None,
       
   399                 class_=None, fallback=False):
       
   400     if class_ is None:
       
   401         class_ = GNUTranslations
       
   402     mofiles = find(domain, localedir, languages, all=1)
       
   403     if not mofiles:
       
   404         if fallback:
       
   405             return NullTranslations()
       
   406         raise IOError(ENOENT, 'No translation file found for domain', domain)
       
   407     # TBD: do we need to worry about the file pointer getting collected?
       
   408     # Avoid opening, reading, and parsing the .mo file after it's been done
       
   409     # once.
       
   410     result = None
       
   411     for mofile in mofiles:
       
   412         key = os.path.abspath(mofile)
       
   413         t = _translations.get(key)
       
   414         if t is None:
       
   415             t = _translations.setdefault(key, class_(open(mofile, 'rb')))
       
   416         # Copy the translation object to allow setting fallbacks.
       
   417         # All other instance data is shared with the cached object.
       
   418         t = copy.copy(t)
       
   419         if result is None:
       
   420             result = t
       
   421         else:
       
   422             result.add_fallback(t)
       
   423     return result
       
   424 
       
   425 
       
   426 def install(domain, localedir=None, unicode=False):
       
   427     translation(domain, localedir, fallback=True).install(unicode)
       
   428 
       
   429 
       
   430 
       
   431 # a mapping b/w domains and locale directories
       
   432 _localedirs = {}
       
   433 # current global domain, `messages' used for compatibility w/ GNU gettext
       
   434 _current_domain = 'messages'
       
   435 
       
   436 
       
   437 def textdomain(domain=None):
       
   438     global _current_domain
       
   439     if domain is not None:
       
   440         _current_domain = domain
       
   441     return _current_domain
       
   442 
       
   443 
       
   444 def bindtextdomain(domain, localedir=None):
       
   445     global _localedirs
       
   446     if localedir is not None:
       
   447         _localedirs[domain] = localedir
       
   448     return _localedirs.get(domain, _default_localedir)
       
   449 
       
   450 
       
   451 def dgettext(domain, message):
       
   452     try:
       
   453         t = translation(domain, _localedirs.get(domain, None))
       
   454     except IOError:
       
   455         return message
       
   456     return t.gettext(message)
       
   457 
       
   458 
       
   459 def dngettext(domain, msgid1, msgid2, n):
       
   460     try:
       
   461         t = translation(domain, _localedirs.get(domain, None))
       
   462     except IOError:
       
   463         if n == 1:
       
   464             return msgid1
       
   465         else:
       
   466             return msgid2
       
   467     return t.ngettext(msgid1, msgid2, n)
       
   468 
       
   469 
       
   470 def gettext(message):
       
   471     return dgettext(_current_domain, message)
       
   472 
       
   473 
       
   474 def ngettext(msgid1, msgid2, n):
       
   475     return dngettext(_current_domain, msgid1, msgid2, n)
       
   476 
       
   477 
       
   478 # dcgettext() has been deemed unnecessary and is not implemented.
       
   479 
       
   480 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
       
   481 # was:
       
   482 #
       
   483 #    import gettext
       
   484 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
       
   485 #    _ = cat.gettext
       
   486 #    print _('Hello World')
       
   487 
       
    488 # The resulting catalog object currently doesn't support access through a
       
   489 # dictionary API, which was supported (but apparently unused) in GNOME
       
   490 # gettext.
       
   491 
       
   492 Catalog = translation