devtools/fix_po_encoding
author Julien Jehannet <julien.jehannet@logilab.fr>
Fri, 05 Feb 2010 17:13:53 +0100
changeset 4527 67ab70e98488
parent 0 b97547f5f1fa
permissions -rwxr-xr-x
[R] devtools: improve default data import mechanism Validation chain is now possible with checkers Before that the expected values needed to be coherent. Now, we can use ObjectStore to validate the input data * add new input transformers: - uppercase - lowercase * add new input checkers (raise AssertionError on error): - decimal: take care of possible comma character as number separator - integer: cast to int() - yesno: to validate boolean value - isalpha - required: input value *must* not be empty * new control checker: - optional: block possible exception we delete field in the returned dict instead of raising AssertionError (exclusive with required) Helper methods to manipulate indexes: * build_rqlindex() is used to build index based on already created entities * fetch() replace get_one()/get_many() methods by factorizing code Minor changes in reporting: * use tell() for all printing * let new value for askerrors to display automatically the report (used in crontab)

#!/usr/bin/python

"""usage: fix-po-encodings [filename...]
Change the encoding of the PO files passed as arguments to UTF-8, updating
the charset declared in their Content-Type header accordingly.
"""
import sys
import re
import codecs

def change_encoding(filename, target='UTF-8'):
    """Re-encode the PO file `filename` in place to `target`.

    The current encoding is the charset declared in the file's
    ``Content-Type`` header. When that charset is exactly equal to `target`
    the file is left untouched; otherwise the header is rewritten to declare
    `target` and the whole content is decoded from the declared encoding and
    written back encoded as `target`.

    :param filename: path of the PO file to convert
    :param target: name of the encoding to convert to
    :raise ValueError: if the file has no charset declaration
      (propagated from `find_encoding`)
    """
    # Read raw bytes: the file is written back in binary mode, so reading in
    # text mode could alter line endings on platforms that translate them.
    with open(filename, 'rb') as fdesc:
        data = fdesc.read()
    encoding = find_encoding(data)
    if encoding == target:
        return
    # Patch the header first, then decode. Decoding happens before the file
    # is reopened for writing, so a decoding failure leaves the file intact.
    data = fix_encoding(data, target)
    data = unicode(data, encoding)
    with codecs.open(filename, 'wb', encoding=target) as fdesc:
        fdesc.write(data)

def find_encoding(data):
    """Return the charset declared in the ``Content-Type`` header of `data`.

    :param data: content of a PO file
    :return: the charset name, exactly as written in the header
    :raise ValueError: if no charset declaration is found
    """
    # Charset names may contain '_', '.' or ':' in addition to alphanumerics
    # and '-' (e.g. Shift_JIS, ISO_8859-1). The pattern ends on the literal
    # backslash-n sequence that closes the header string in the PO source.
    regexp = re.compile(r'"Content-Type:.* charset=([a-zA-Z0-9_.:-]+)\\n"', re.M)
    mo = regexp.search(data)
    if mo is None:
        raise ValueError('No encoding declaration')
    return mo.group(1)

def fix_encoding(data, target_encoding):
    """Return `data` with the charset of its ``Content-Type`` header replaced.

    :param data: content of a PO file
    :param target_encoding: charset name to declare in the header
    :return: a copy of `data` whose header declares `target_encoding`;
      `data` is returned unchanged when no header line matches
    """
    regexp = re.compile(r'("Content-Type:.* charset=)(.*)(\\n")', re.M)

    def declare_target(mo):
        # Assemble the replacement by hand instead of using a template
        # string: in a template, a target starting with a digit would merge
        # with the preceding group reference, and a backslash in the target
        # would be interpreted as an escape.
        return mo.group(1) + target_encoding + mo.group(3)

    return regexp.sub(declare_target, data)
    


# Script body: convert every file named on the command line, echoing each
# name on standard output before it is processed.
for filename in sys.argv[1:]:
    sys.stdout.write('%s\n' % filename)
    change_encoding(filename)